import sys
def run_with_argv(main_fn, argv):
    """Call main_fn with sys.argv temporarily replaced by argv, then restore it."""
    old_argv = sys.argv[:]
    sys.argv = argv
    try:
        main_fn()
    finally:
        # Restore even if main_fn raises so later notebook cells see the original argv.
        sys.argv = old_argv

Walmart Weekly Sales Forecasting: Model Comparison
1 Objective
This report compares six forecasting approaches for weekly Walmart sales:
- Local Linear Anomaly Model (Phase 1).
- Bayesian Structural AR (Phase 2, full data).
- Elastic Net anomaly baseline (Phase 3).
- Random Forest anomaly baseline (Phase 4).
- ETS anomaly baseline (Phase 5).
- AdaBoost anomaly baseline (Phase 7).
Additionally, we include a Phase 1-lite diagnostic variant in the final comparison to show the impact of removing the extra exogenous variables from Local Linear.
Evaluation uses forward-chaining validation and weighted MAE (WMAE).
2 Notebook Training Cells (Self-Contained)
These cells are self-contained: they do not import local script files. Run the definition cell for a model, then run its training cell.
2.1 Runner Helper
2.2 Phase 1 Definitions
from __future__ import annotations
import argparse
import json
from collections import deque
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
# Modeled target: weekly sales minus climatology ("sales anomaly").
TARGET_COL = "sales_anom"
# Autoregressive lag depths in weeks.
LAG_ORDERS = (1, 2)
LAG_FEATURE_COLS = [f"sales_anom_lag{lag}" for lag in LAG_ORDERS]
# Full exogenous regressor set (weather/fuel anomalies, markdowns, macro).
FULL_EXOG_FEATURE_COLS = [
    "temp_anom",
    "fuel_anom",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
]
# Reduced exogenous set used by the Phase 1-lite diagnostic variant.
LITE_EXOG_FEATURE_COLS = [
    "temp_anom",
    "fuel_anom",
]
# Features shared by both feature modes.
CORE_FEATURE_COLS = [
    "sales_anom_lag1",
    "sales_anom_lag2",
    "is_holiday_int",
]
# Optional targeted interaction feature.
INTERACTION_FEATURE_COLS = [
    "holiday_x_lag1",
]
# ISO calendars may contain a week 53, so the weekly cycle uses period 53.
WEEK_PERIOD = 53
@dataclass(slots=True)
class LocalLinearConfig:
    """Hyper-parameters for the Phase 1 local linear anomaly model."""

    kernel: str  # "tricube" or "exponential"
    bandwidth: int  # neighborhood width in weeks for the local kernel
    min_samples: int  # minimum active samples required to fit a local model
    ridge: float  # L2 penalty on non-intercept coefficients
    coef_clip: float  # absolute clip applied to fitted non-intercept coefficients
    anom_clip_scale: float  # width multiplier for recursive anomaly clipping bounds
    use_interactions: bool  # include the holiday x lag1 interaction feature
    feature_mode: str  # "full" or "lite" exogenous feature set
@dataclass(slots=True)
class FittedLocalLinear:
    """Fitted Phase 1 model state consumed by `pick_beta` / `recursive_predict`."""

    local_params: dict[tuple[str, int], np.ndarray]  # (series_id, week_of_year) -> beta
    available_weeks: dict[str, np.ndarray]  # weeks with a learned local model, per series
    series_fallback: dict[str, np.ndarray]  # per-series all-weeks ridge fit
    global_fallback: np.ndarray  # pooled fit across all series (last resort)
    series_bounds: dict[str, tuple[float, float]]  # per-series anomaly clip bounds
    global_bounds: tuple[float, float]  # pooled anomaly clip bounds
    feature_cols: list[str]  # feature order the betas were fitted against
    feature_means: np.ndarray  # standardization means from training
    feature_stds: np.ndarray  # standardization stds from training
def get_feature_cols(config: LocalLinearConfig) -> list[str]:
    """Assemble the ordered feature list for the given configuration."""
    columns = [*get_exog_feature_cols(config), *CORE_FEATURE_COLS]
    if config.use_interactions:
        columns += INTERACTION_FEATURE_COLS
    return columns
def get_exog_feature_cols(config: LocalLinearConfig) -> list[str]:
    """Return a fresh list of exogenous feature names for the configured mode."""
    chosen = LITE_EXOG_FEATURE_COLS if config.feature_mode == "lite" else FULL_EXOG_FEATURE_COLS
    return list(chosen)
def parse_args() -> argparse.Namespace:
    """Parse Phase 1 command-line options (data paths, kernel, CV layout)."""
    parser = argparse.ArgumentParser(
        description="Phase 1: Local linear regression on anomalies with forward-chaining CV."
    )
    parser.add_argument("--train-path", default="train_feat.parquet")
    parser.add_argument("--test-path", default="test_feat.parquet")
    parser.add_argument("--output-dir", default="outputs")
    parser.add_argument("--kernel", choices=["tricube", "exponential"], default="tricube")
    parser.add_argument("--bandwidth", type=int, default=6, help="Neighborhood width in weeks.")
    parser.add_argument("--min-samples", type=int, default=16)
    parser.add_argument("--ridge", type=float, default=1e-4)
    parser.add_argument("--coef-clip", type=float, default=6.0, help="Absolute clip for non-intercept coefficients.")
    parser.add_argument(
        "--anom-clip-scale",
        type=float,
        default=2.0,
        help="How wide recursive anomaly clipping bounds are around train quantiles.",
    )
    parser.add_argument("--n-folds", type=int, default=4)
    parser.add_argument("--val-weeks", type=int, default=13)
    parser.add_argument("--max-series", type=int, default=None, help="Optional debug subset.")
    # BooleanOptionalAction provides both --use-interactions and --no-use-interactions.
    parser.add_argument(
        "--use-interactions",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Use targeted interaction term (holiday x lag1).",
    )
    parser.add_argument(
        "--feature-mode",
        choices=["full", "lite"],
        default="full",
        help="`full` uses all exogenous features; `lite` keeps only temp/fuel exogenous features.",
    )
    return parser.parse_args()
def weighted_ridge_fit(X: np.ndarray, y: np.ndarray, weights: np.ndarray, ridge: float) -> np.ndarray:
    """Solve a weighted ridge regression via the normal equations.

    An intercept column is prepended and left unpenalized; the pseudo-inverse
    keeps the solve stable even when the Gram matrix is singular.
    Returns the coefficient vector [intercept, coef_1, ..., coef_k].
    """
    if X.size == 0:
        # No rows: neutral all-zero coefficients of the right length.
        return np.zeros(X.shape[1] + 1, dtype=float)
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows, dtype=float), X])
    root_w = np.sqrt(np.clip(weights, 0.0, None))
    weighted_design = design * root_w[:, None]
    weighted_target = y * root_w
    penalty = np.eye(weighted_design.shape[1], dtype=float) * ridge
    penalty[0, 0] = 0.0  # never shrink the intercept
    gram = weighted_design.T @ weighted_design + penalty
    moment = weighted_design.T @ weighted_target
    beta = np.linalg.pinv(gram) @ moment
    return beta.astype(float, copy=False)
def regularize_beta(beta: np.ndarray, coef_clip: float) -> np.ndarray:
    """Return a copy of beta with non-intercept coefficients clipped to ±coef_clip."""
    result = beta.copy()
    np.clip(result[1:], -coef_clip, coef_clip, out=result[1:])
    return result
def cyclic_week_distance(weeks: np.ndarray, target_week: int) -> np.ndarray:
    """Shortest distance between week-of-year values on the WEEK_PERIOD circle."""
    direct = np.abs(weeks - target_week)
    return np.minimum(direct, WEEK_PERIOD - direct)
def kernel_weights(distance: np.ndarray, bandwidth: int, kernel: str) -> np.ndarray:
    """Map distances to kernel weights.

    "tricube" has compact support (zero beyond the bandwidth); "exponential"
    is an unbounded Gaussian-style decay. Raises ValueError for other names.
    """
    bw = float(bandwidth)
    if kernel == "exponential":
        return np.exp(-0.5 * (distance / bw) ** 2)
    if kernel == "tricube":
        u = np.clip(distance / bw, 0.0, 1.0)
        w = (1.0 - u**3) ** 3
        w[distance > bandwidth] = 0.0  # hard cutoff outside the neighborhood
        return w
    raise ValueError(f"Unsupported kernel: {kernel}")
def robust_bounds(values: np.ndarray, clip_scale: float) -> tuple[float, float]:
    """Quantile-based clipping bounds with graceful degradation.

    The span is the widest of (q99-q01), the std, and 1.0 so tiny or constant
    samples still yield a usable interval; empty input gives (-1, 1).
    """
    if values.size == 0:
        return (-1.0, 1.0)
    low_q, high_q = np.quantile(values, [0.01, 0.99])
    spread = max(float(high_q - low_q), float(np.std(values)), 1.0)
    lo = float(low_q - clip_scale * spread)
    hi = float(high_q + clip_scale * spread)
    if lo >= hi:
        # Degenerate interval: fall back to a unit band around the median.
        mid = float(np.median(values))
        return (mid - 1.0, mid + 1.0)
    return (lo, hi)
def fit_standard_scaler(X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Column means and stds for z-scoring.

    Non-finite or near-zero stds are replaced with 1.0 and non-finite means
    with 0.0 so scaling never divides by zero or propagates NaNs.
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    degenerate = (~np.isfinite(sigma)) | (sigma < 1e-8)
    sigma[degenerate] = 1.0
    mu[~np.isfinite(mu)] = 0.0
    return mu.astype(float, copy=False), sigma.astype(float, copy=False)
def apply_standard_scaler(X: np.ndarray, means: np.ndarray, stds: np.ndarray) -> np.ndarray:
    """Z-score X with precomputed column statistics."""
    centered = X - means
    return centered / stds
def load_data(train_path: Path, test_path: Path, max_series: int | None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read train/test parquet files and derive id/calendar helper columns.

    Adds series_id ("Store_Dept"), ISO week-of-year, and an integer holiday
    flag. `max_series` optionally restricts both frames to the first N series
    ids (sorted) for quick debugging runs.
    """
    train = pd.read_parquet(train_path)
    test = pd.read_parquet(test_path)
    for frame in (train, test):
        frame["Date"] = pd.to_datetime(frame["Date"], errors="coerce")
        frame["Store"] = frame["Store"].astype(int)
        frame["Dept"] = frame["Dept"].astype(int)
        frame["series_id"] = frame["Store"].astype(str) + "_" + frame["Dept"].astype(str)
        frame["week_of_year"] = frame["Date"].dt.isocalendar().week.astype(int)
        frame["is_holiday_int"] = frame["IsHoliday"].astype(int)
    if max_series is not None:
        keep_ids = (
            train["series_id"]
            .drop_duplicates()
            .sort_values()
            .head(max_series)
            .tolist()
        )
        train = train[train["series_id"].isin(keep_ids)].copy()
        test = test[test["series_id"].isin(keep_ids)].copy()
    train = train.sort_values(["Date", "Store", "Dept"]).reset_index(drop=True)
    test = test.sort_values(["Date", "Store", "Dept"]).reset_index(drop=True)
    return train, test
def build_forward_folds(
    train_df: pd.DataFrame, n_folds: int, val_weeks: int
) -> list[tuple[int, np.ndarray, np.ndarray]]:
    """Expanding-window forward-chaining CV folds over the unique week dates.

    Returns (fold_id, train_dates, val_dates) triples where each fold's
    training span grows by `val_weeks`. Raises ValueError when there are not
    enough weeks to reserve n_folds * val_weeks for validation.
    """
    unique_dates = np.array(sorted(train_df["Date"].dropna().unique()))
    base = len(unique_dates) - n_folds * val_weeks
    if base <= 0:
        raise ValueError(
            f"Not enough unique weeks ({len(unique_dates)}) for n_folds={n_folds}, val_weeks={val_weeks}."
        )
    result: list[tuple[int, np.ndarray, np.ndarray]] = []
    for k in range(n_folds):
        cut = base + k * val_weeks
        result.append((k + 1, unique_dates[:cut], unique_dates[cut:cut + val_weeks]))
    return result
def compute_climatology(train_df: pd.DataFrame) -> dict[str, pd.DataFrame | dict[str, float]]:
    """Median sales/temperature/fuel by week at series, store, and global level.

    The three granularities plus the overall medians form the fallback chain
    consumed by `attach_climatology`.
    """
    # Per-(series, week) medians: the most specific climatology level.
    series = (
        train_df.groupby(["series_id", "week_of_year"], as_index=False)[
            ["Weekly_Sales", "Temperature", "Fuel_Price"]
        ]
        .median()
        .rename(
            columns={
                "Weekly_Sales": "clim_sales_series",
                "Temperature": "clim_temp_series",
                "Fuel_Price": "clim_fuel_series",
            }
        )
    )
    # Per-(store, week) medians: fallback when a series/week pair is unseen.
    store = (
        train_df.groupby(["Store", "week_of_year"], as_index=False)[
            ["Weekly_Sales", "Temperature", "Fuel_Price"]
        ]
        .median()
        .rename(
            columns={
                "Weekly_Sales": "clim_sales_store",
                "Temperature": "clim_temp_store",
                "Fuel_Price": "clim_fuel_store",
            }
        )
    )
    # Per-week medians across all stores: second-level fallback.
    global_week = (
        train_df.groupby("week_of_year", as_index=False)[["Weekly_Sales", "Temperature", "Fuel_Price"]]
        .median()
        .rename(
            columns={
                "Weekly_Sales": "clim_sales_global",
                "Temperature": "clim_temp_global",
                "Fuel_Price": "clim_fuel_global",
            }
        )
    )
    # Dataset-wide medians: last-resort fallback values.
    overall = {
        "sales": float(train_df["Weekly_Sales"].median()),
        "temp": float(train_df["Temperature"].median()),
        "fuel": float(train_df["Fuel_Price"].median()),
    }
    return {"series": series, "store": store, "global_week": global_week, "overall": overall}
def attach_climatology(
    df: pd.DataFrame, climatology: dict[str, pd.DataFrame | dict[str, float]]
) -> pd.DataFrame:
    """Merge climatology medians onto df and derive anomaly columns.

    Fallback order per column: series -> store -> global week -> overall
    median. Intermediate merge columns are dropped before returning.
    """
    out = df.merge(
        climatology["series"], on=["series_id", "week_of_year"], how="left"  # type: ignore[arg-type]
    )
    out = out.merge(climatology["store"], on=["Store", "week_of_year"], how="left")  # type: ignore[arg-type]
    out = out.merge(climatology["global_week"], on=["week_of_year"], how="left")  # type: ignore[arg-type]
    overall = climatology["overall"]  # type: ignore[assignment]
    # Coalesce each climatology down the specificity chain.
    out["clim_sales"] = (
        out["clim_sales_series"].fillna(out["clim_sales_store"]).fillna(out["clim_sales_global"]).fillna(overall["sales"])
    )
    out["clim_temp"] = (
        out["clim_temp_series"].fillna(out["clim_temp_store"]).fillna(out["clim_temp_global"]).fillna(overall["temp"])
    )
    out["clim_fuel"] = (
        out["clim_fuel_series"].fillna(out["clim_fuel_store"]).fillna(out["clim_fuel_global"]).fillna(overall["fuel"])
    )
    out["temp_anom"] = out["Temperature"] - out["clim_temp"]
    out["fuel_anom"] = out["Fuel_Price"] - out["clim_fuel"]
    # Target anomaly only exists when actual sales are present (train frames).
    if "Weekly_Sales" in out.columns:
        out["sales_anom"] = out["Weekly_Sales"] - out["clim_sales"]
    drop_cols = [
        "clim_sales_series",
        "clim_temp_series",
        "clim_fuel_series",
        "clim_sales_store",
        "clim_temp_store",
        "clim_fuel_store",
        "clim_sales_global",
        "clim_temp_global",
        "clim_fuel_global",
    ]
    return out.drop(columns=drop_cols, errors="ignore")
def add_anomaly_lags(df: pd.DataFrame) -> pd.DataFrame:
    """Append per-series lagged anomaly columns plus the holiday interaction."""
    frame = df.sort_values(["series_id", "Date"]).copy()
    by_series = frame.groupby("series_id")["sales_anom"]
    for depth in LAG_ORDERS:
        frame[f"sales_anom_lag{depth}"] = by_series.shift(depth)
    frame["holiday_x_lag1"] = frame["is_holiday_int"] * frame["sales_anom_lag1"]
    return frame
def fit_local_models(train_df: pd.DataFrame, config: LocalLinearConfig) -> FittedLocalLinear:
    """Fit per-(series, week) locally weighted ridge models on standardized features.

    Produces a fallback chain — local week model, per-series global fit,
    pooled global fit — plus per-series and global anomaly clipping bounds
    used to stabilize recursive prediction.
    """
    feature_cols = get_feature_cols(config)
    valid = train_df.copy()
    # Keep early rows by treating unavailable long-lag anomalies as neutral (0).
    for c in LAG_FEATURE_COLS + INTERACTION_FEATURE_COLS:
        if c in valid.columns:
            valid[c] = valid[c].fillna(0.0)
    # Impute missing exogenous values with the column median (0 if degenerate).
    for c in get_exog_feature_cols(config):
        if c in valid.columns and valid[c].isna().any():
            median = float(valid[c].median())
            if not np.isfinite(median):
                median = 0.0
            valid[c] = valid[c].fillna(median)
    valid = valid.dropna(subset=feature_cols + [TARGET_COL]).copy()
    if valid.empty:
        # Degenerate case: no usable rows -> zero model with neutral bounds/scaler.
        zero_beta = np.zeros(len(feature_cols) + 1, dtype=float)
        return FittedLocalLinear(
            {},
            {},
            {},
            zero_beta,
            {},
            (-1.0, 1.0),
            feature_cols,
            np.zeros(len(feature_cols), dtype=float),
            np.ones(len(feature_cols), dtype=float),
        )
    all_X_raw = valid[feature_cols].to_numpy(dtype=float)
    scaler_means, scaler_stds = fit_standard_scaler(all_X_raw)
    all_X = apply_standard_scaler(all_X_raw, scaler_means, scaler_stds)
    all_y = valid[TARGET_COL].to_numpy(dtype=float)
    # Pooled fit across every series: the last-resort fallback beta.
    global_fallback = regularize_beta(
        weighted_ridge_fit(
            all_X, all_y, np.ones(all_y.shape[0], dtype=float), ridge=config.ridge
        ),
        coef_clip=config.coef_clip,
    )
    anom_values = train_df[TARGET_COL].dropna().to_numpy(dtype=float)
    global_bounds = robust_bounds(anom_values, clip_scale=config.anom_clip_scale)
    series_bounds: dict[str, tuple[float, float]] = {}
    local_params: dict[tuple[str, int], np.ndarray] = {}
    available_weeks: dict[str, np.ndarray] = {}
    series_fallback: dict[str, np.ndarray] = {}
    grouped = valid.groupby("series_id", sort=False)
    for i, (series_id, group) in enumerate(grouped, start=1):
        g = group.sort_values("Date")
        X_raw = g[feature_cols].to_numpy(dtype=float)
        X = apply_standard_scaler(X_raw, scaler_means, scaler_stds)
        y = g[TARGET_COL].to_numpy(dtype=float)
        weeks = g["week_of_year"].to_numpy(dtype=int)
        series_bounds[series_id] = robust_bounds(y, clip_scale=config.anom_clip_scale)
        # Per-series fallback beta (needs at least 3 rows to be meaningful).
        if y.shape[0] >= 3:
            series_fallback[series_id] = regularize_beta(
                weighted_ridge_fit(X, y, np.ones(y.shape[0], dtype=float), ridge=config.ridge),
                coef_clip=config.coef_clip,
            )
        else:
            series_fallback[series_id] = global_fallback
        learned_weeks: list[int] = []
        for target_week in range(1, WEEK_PERIOD + 1):
            distance = cyclic_week_distance(weeks, target_week)
            weights = kernel_weights(distance, config.bandwidth, config.kernel)
            active = weights > 0.0
            if int(active.sum()) < config.min_samples:
                # Too little local data around this week; rely on fallbacks later.
                continue
            beta = regularize_beta(
                weighted_ridge_fit(X[active], y[active], weights[active], ridge=config.ridge),
                coef_clip=config.coef_clip,
            )
            local_params[(series_id, target_week)] = beta
            learned_weeks.append(target_week)
        if learned_weeks:
            available_weeks[series_id] = np.array(learned_weeks, dtype=int)
        if i % 300 == 0:
            print(f" fitted models for {i} series...")
    return FittedLocalLinear(
        local_params,
        available_weeks,
        series_fallback,
        global_fallback,
        series_bounds,
        global_bounds,
        feature_cols,
        scaler_means,
        scaler_stds,
    )
def _lag_from_history(hist: deque[float], lag: int) -> float:
if len(hist) >= lag:
return float(hist[-lag])
return 0.0
def initialize_history(train_df: pd.DataFrame) -> dict[str, deque[float]]:
    """Seed one bounded anomaly deque per series from the training tail."""
    deepest = max(LAG_ORDERS)
    histories: dict[str, deque[float]] = {}
    by_time = train_df.sort_values(["series_id", "Date"])
    for sid, grp in by_time.groupby("series_id", sort=False):
        observed = grp["sales_anom"].dropna().to_numpy(dtype=float)
        histories[sid] = deque(observed.tolist(), maxlen=deepest)
    return histories
def pick_beta(model: FittedLocalLinear, series_id: str, week_of_year: int) -> np.ndarray:
    """Most specific available coefficients for a (series, week) pair.

    Order: exact local model -> nearest learned week (cyclically) ->
    per-series fallback -> pooled global fallback.
    """
    exact = model.local_params.get((series_id, week_of_year))
    if exact is not None:
        return exact
    learned_weeks = model.available_weeks.get(series_id)
    if learned_weeks is not None and learned_weeks.size > 0:
        gap = np.abs(learned_weeks - week_of_year)
        wrapped = np.minimum(gap, WEEK_PERIOD - gap)
        closest = int(learned_weeks[np.argmin(wrapped)])
        return model.local_params[(series_id, closest)]
    per_series = model.series_fallback.get(series_id)
    return per_series if per_series is not None else model.global_fallback
def _build_feature_vector(row, hist: deque[float], feature_cols: list[str]) -> np.ndarray:
    """Assemble the raw (unscaled) feature vector for one future row.

    Non-finite exogenous values are zeroed; lag features come from the
    recursive prediction history. All candidate features are computed, then
    `feature_cols` selects and orders the subset the fitted model expects.
    """
    def safe_float(value: float) -> float:
        # Guard against NaN/inf leaking into the dot product.
        value = float(value)
        if np.isfinite(value):
            return value
        return 0.0
    lag_vals = {
        "sales_anom_lag1": _lag_from_history(hist, 1),
        "sales_anom_lag2": _lag_from_history(hist, 2),
    }
    feat = {
        "temp_anom": safe_float(row.temp_anom),
        "fuel_anom": safe_float(row.fuel_anom),
        "MarkDown1": safe_float(row.MarkDown1),
        "MarkDown2": safe_float(row.MarkDown2),
        "MarkDown3": safe_float(row.MarkDown3),
        "MarkDown4": safe_float(row.MarkDown4),
        "MarkDown5": safe_float(row.MarkDown5),
        "CPI": safe_float(row.CPI),
        "Unemployment": safe_float(row.Unemployment),
        "is_holiday_int": safe_float(row.is_holiday_int),
        **lag_vals,
        "holiday_x_lag1": safe_float(row.is_holiday_int) * lag_vals["sales_anom_lag1"],
    }
    return np.array([feat[c] for c in feature_cols], dtype=float)
def recursive_predict(
    future_df: pd.DataFrame, model: FittedLocalLinear, history: dict[str, deque[float]]
) -> pd.DataFrame:
    """Recursive one-step-ahead forecast where each prediction feeds the lags.

    Rows are processed in (Date, series_id) order so lag features always see
    the latest (possibly predicted) anomaly per series. Predicted anomalies
    are clipped to the series (or global) bounds before being added back to
    climatology. Returns future_df with pred_sales/pred_anom merged on row_id.
    """
    ordered = future_df.sort_values(["Date", "series_id"]).copy()
    preds: list[tuple[int, float, float]] = []
    max_lag = max(LAG_ORDERS)
    for row in ordered.itertuples(index=False):
        series_id = row.series_id
        hist = history.get(series_id)
        if hist is None:
            # Series unseen in training: start empty (lags read as 0).
            hist = deque(maxlen=max_lag)
            history[series_id] = hist
        beta = pick_beta(model, series_id, int(row.week_of_year))
        x_raw = _build_feature_vector(row, hist, model.feature_cols)
        x = apply_standard_scaler(x_raw, model.feature_means, model.feature_stds)
        pred_anom = float(beta[0] + np.dot(beta[1:], x))
        lower, upper = model.series_bounds.get(series_id, model.global_bounds)
        pred_anom = float(np.clip(pred_anom, lower, upper))
        pred_sales = float(pred_anom + row.clim_sales)
        preds.append((int(row.row_id), pred_sales, pred_anom))
        # Feed the clipped prediction back for subsequent weeks' lags.
        hist.append(pred_anom)
    pred_df = pd.DataFrame(preds, columns=["row_id", "pred_sales", "pred_anom"])
    return future_df.merge(pred_df, on="row_id", how="left")
def wmae(y_true: np.ndarray, y_pred: np.ndarray, holiday: np.ndarray) -> float:
    """Weighted MAE where holiday rows count five times as much as others."""
    sample_weights = np.where(holiday, 5.0, 1.0)
    abs_errors = np.abs(y_true - y_pred)
    return float(np.average(abs_errors, weights=sample_weights))
def summarize_metrics(df: pd.DataFrame) -> dict[str, float]:
    """WMAE/MAE/RMSE (plus anomaly MAE when available) for a prediction frame."""
    actual = df["Weekly_Sales"].to_numpy(dtype=float)
    predicted = df["pred_sales"].to_numpy(dtype=float)
    holiday_flags = df["IsHoliday"].to_numpy(dtype=bool)
    residuals = actual - predicted
    summary = {
        "wmae": wmae(actual, predicted, holiday_flags),
        "mae": float(np.mean(np.abs(residuals))),
        "rmse": float(np.sqrt(np.mean(residuals**2))),
        "n_rows": float(df.shape[0]),
    }
    if "sales_anom" in df.columns and "pred_anom" in df.columns:
        summary["anom_mae"] = float(np.mean(np.abs(df["sales_anom"] - df["pred_anom"])))
    return summary
def run_cv(
    train_df: pd.DataFrame, config: LocalLinearConfig, n_folds: int, val_weeks: int
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Forward-chaining cross-validation for the local linear model.

    Climatology and lag features are rebuilt inside each fold from that
    fold's training span only, so validation rows never leak into features.
    Returns (per-fold metrics with a trailing mean row, stacked OOF preds).
    """
    folds = build_forward_folds(train_df, n_folds=n_folds, val_weeks=val_weeks)
    metrics_rows: list[dict[str, float | int | str]] = []
    oof_parts: list[pd.DataFrame] = []
    for fold_id, train_dates, val_dates in folds:
        print(f"\nFold {fold_id}: train={len(train_dates)} weeks, val={len(val_dates)} weeks")
        fold_train = train_df[train_df["Date"].isin(train_dates)].copy()
        fold_val = train_df[train_df["Date"].isin(val_dates)].copy()
        # row_id lets recursive_predict merge predictions back positionally.
        fold_val["row_id"] = np.arange(fold_val.shape[0], dtype=int)
        climatology = compute_climatology(fold_train)
        fold_train = add_anomaly_lags(attach_climatology(fold_train, climatology))
        model = fit_local_models(fold_train, config)
        history = initialize_history(fold_train)
        fold_val = attach_climatology(fold_val, climatology)
        fold_val_pred = recursive_predict(fold_val, model, history)
        fold_metrics = summarize_metrics(fold_val_pred)
        fold_metrics["fold"] = fold_id
        fold_metrics["train_start"] = str(pd.Timestamp(train_dates[0]).date())
        fold_metrics["train_end"] = str(pd.Timestamp(train_dates[-1]).date())
        fold_metrics["val_start"] = str(pd.Timestamp(val_dates[0]).date())
        fold_metrics["val_end"] = str(pd.Timestamp(val_dates[-1]).date())
        metrics_rows.append(fold_metrics)
        print(
            f" wmae={fold_metrics['wmae']:.4f} mae={fold_metrics['mae']:.4f} "
            f"rmse={fold_metrics['rmse']:.4f}"
        )
        keep_cols = [
            "Store",
            "Dept",
            "Date",
            "IsHoliday",
            "Weekly_Sales",
            "sales_anom",
            "pred_sales",
            "pred_anom",
        ]
        part = fold_val_pred[keep_cols].copy()
        part["fold"] = fold_id
        oof_parts.append(part)
    metrics_df = pd.DataFrame(metrics_rows)
    if not metrics_df.empty:
        # Append an aggregate row so downstream tables show the CV mean.
        mean_row = {"fold": "mean"}
        for col in ["wmae", "mae", "rmse", "anom_mae", "n_rows"]:
            if col in metrics_df.columns:
                mean_row[col] = float(metrics_df[col].mean())
        metrics_df = pd.concat([metrics_df, pd.DataFrame([mean_row])], ignore_index=True)
    oof_df = pd.concat(oof_parts, ignore_index=True) if oof_parts else pd.DataFrame()
    return metrics_df, oof_df
def fit_full_and_predict_test(
    train_df: pd.DataFrame, test_df: pd.DataFrame, config: LocalLinearConfig
) -> pd.DataFrame:
    """Refit on the full training history and forecast the test horizon recursively."""
    train_full = train_df.copy()
    test_full = test_df.copy()
    # row_id lets recursive_predict merge predictions back onto the test frame.
    test_full["row_id"] = np.arange(test_full.shape[0], dtype=int)
    climatology = compute_climatology(train_full)
    train_full = add_anomaly_lags(attach_climatology(train_full, climatology))
    model = fit_local_models(train_full, config)
    history = initialize_history(train_full)
    test_full = attach_climatology(test_full, climatology)
    pred = recursive_predict(test_full, model, history)
    return pred
def ensure_dirs(base: Path) -> dict[str, Path]:
    """Create (if needed) and return the standard output directory layout."""
    layout = {
        "root": base,
        "metrics": base / "metrics",
        "preds": base / "preds",
        "submissions": base / "submissions",
    }
    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)
    return layout
def make_submission(pred_df: pd.DataFrame) -> pd.DataFrame:
    """Kaggle submission frame: Id 'Store_Dept_YYYY-MM-DD' plus Weekly_Sales."""
    frame = pred_df.copy()
    store_part = frame["Store"].astype(int).astype(str)
    dept_part = frame["Dept"].astype(int).astype(str)
    date_part = frame["Date"].dt.strftime("%Y-%m-%d")
    frame["Id"] = store_part + "_" + dept_part + "_" + date_part
    return frame[["Id", "pred_sales"]].rename(columns={"pred_sales": "Weekly_Sales"})
def main() -> None:
    """Phase 1 driver: load data, run CV, refit on full train, write artifacts."""
    args = parse_args()
    config = LocalLinearConfig(
        kernel=args.kernel,
        bandwidth=args.bandwidth,
        min_samples=args.min_samples,
        ridge=args.ridge,
        coef_clip=args.coef_clip,
        anom_clip_scale=args.anom_clip_scale,
        use_interactions=args.use_interactions,
        feature_mode=args.feature_mode,
    )
    train_path = Path(args.train_path)
    test_path = Path(args.test_path)
    output_paths = ensure_dirs(Path(args.output_dir))
    print("Loading data...")
    train_df, test_df = load_data(train_path, test_path, max_series=args.max_series)
    print(
        f"train rows={train_df.shape[0]}, test rows={test_df.shape[0]}, "
        f"series train={train_df['series_id'].nunique()}, series test={test_df['series_id'].nunique()}"
    )
    print("\nRunning forward-chaining CV...")
    metrics_df, oof_df = run_cv(train_df, config=config, n_folds=args.n_folds, val_weeks=args.val_weeks)
    metrics_path = output_paths["metrics"] / "local_linear_cv.csv"
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Saved CV metrics -> {metrics_path}")
    oof_path = output_paths["preds"] / "local_linear_oof.parquet"
    oof_df.to_parquet(oof_path, index=False)
    print(f"Saved OOF predictions -> {oof_path}")
    print("\nTraining on full train and forecasting test recursively...")
    test_pred = fit_full_and_predict_test(train_df, test_df, config=config)
    test_pred_path = output_paths["preds"] / "local_linear_test.parquet"
    test_pred.to_parquet(test_pred_path, index=False)
    print(f"Saved test predictions -> {test_pred_path}")
    submission = make_submission(test_pred)
    submission_path = output_paths["submissions"] / "local_linear_submission.csv"
    submission.to_csv(submission_path, index=False)
    print(f"Saved submission -> {submission_path}")
    # Persist the exact run configuration beside the metrics for reproducibility.
    config_path = output_paths["metrics"] / "local_linear_config.json"
    config_payload = {
        "kernel": args.kernel,
        "bandwidth": args.bandwidth,
        "min_samples": args.min_samples,
        "ridge": args.ridge,
        "coef_clip": args.coef_clip,
        "anom_clip_scale": args.anom_clip_scale,
        "lags": list(LAG_ORDERS),
        "feature_mode": args.feature_mode,
        "exogenous_features": get_exog_feature_cols(config),
        "use_interactions": args.use_interactions,
        "interaction_cols": INTERACTION_FEATURE_COLS if args.use_interactions else [],
        "feature_cols": get_feature_cols(config),
        "standard_scaler": True,
        "n_folds": args.n_folds,
        "val_weeks": args.val_weeks,
        "max_series": args.max_series,
    }
    config_path.write_text(json.dumps(config_payload, indent=2), encoding="utf-8")
    print(f"Saved run config -> {config_path}")

# Alias used by the notebook runner cells below.
phase1_main = main

2.3 Phase 1 Full Run
# Full-feature Phase 1 run; writes metrics/preds/submission under outputs/.
run_with_argv(
    phase1_main,
    [
        "phase1_local_linear",
        "--output-dir", "outputs",
        "--feature-mode", "full",
        "--no-use-interactions",
    ],
)

2.4 Phase 1-lite Run
# Diagnostic Phase 1-lite run: only temp/fuel exogenous features.
run_with_argv(
    phase1_main,
    [
        "phase1_local_linear",
        "--output-dir", "outputs/phase1_local_linear_lite",
        "--feature-mode", "lite",
        "--no-use-interactions",
    ],
)

2.5 Baseline Utilities Definitions
from __future__ import annotations
import time
from collections import deque
from pathlib import Path
from typing import Callable
import numpy as np
import pandas as pd
# Modeled target shared by all anomaly baselines.
TARGET_COL = "sales_anom"
# Autoregressive lag depths in weeks.
LAG_ORDERS = (1, 2)
# Exogenous regressors present in the feature parquet files.
EXOG_FEATURE_COLS = [
    "temp_anom",
    "fuel_anom",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
]
# Column order of the tabular design matrix fed to the baselines.
ANOM_FEATURE_COLS = [
    "lag1",
    "lag2",
    "week_of_year",
    "month",
    "is_holiday_int",
    *EXOG_FEATURE_COLS,
]
def load_data(train_path: Path, test_path: Path, max_series: int | None = None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read feature parquets and add id/calendar helper columns (baseline variant).

    Unlike the Phase 1 loader this also derives a `month` column used by the
    tabular baselines.
    """
    train = pd.read_parquet(train_path).copy()
    test = pd.read_parquet(test_path).copy()
    for frame in (train, test):
        frame["Date"] = pd.to_datetime(frame["Date"], errors="coerce")
        frame["Store"] = frame["Store"].astype(int)
        frame["Dept"] = frame["Dept"].astype(int)
        frame["series_id"] = frame["Store"].astype(str) + "_" + frame["Dept"].astype(str)
        frame["week_of_year"] = frame["Date"].dt.isocalendar().week.astype(int)
        frame["month"] = frame["Date"].dt.month.astype(int)
        frame["is_holiday_int"] = frame["IsHoliday"].astype(int)
    if max_series is not None:
        # Optional debug subset of the first N series ids (sorted).
        keep_ids = train["series_id"].drop_duplicates().sort_values().head(max_series).tolist()
        train = train[train["series_id"].isin(keep_ids)].copy()
        test = test[test["series_id"].isin(keep_ids)].copy()
    train = train.sort_values(["Date", "Store", "Dept"]).reset_index(drop=True)
    test = test.sort_values(["Date", "Store", "Dept"]).reset_index(drop=True)
    return train, test
def build_forward_folds(train_df: pd.DataFrame, n_folds: int, val_weeks: int) -> list[tuple[int, np.ndarray, np.ndarray]]:
    """Expanding-window forward-chaining folds over unique week dates.

    Raises ValueError when too few weeks remain for the requested layout.
    """
    unique_dates = np.array(sorted(train_df["Date"].dropna().unique()))
    first_train_len = len(unique_dates) - n_folds * val_weeks
    if first_train_len <= 0:
        raise ValueError(
            f"Not enough unique weeks ({len(unique_dates)}) for n_folds={n_folds}, val_weeks={val_weeks}."
        )
    result: list[tuple[int, np.ndarray, np.ndarray]] = []
    for idx in range(1, n_folds + 1):
        boundary = first_train_len + (idx - 1) * val_weeks
        result.append((idx, unique_dates[:boundary], unique_dates[boundary:boundary + val_weeks]))
    return result
def ensure_output_dir(path: Path) -> Path:
    """Create the directory (including parents) if missing and return it."""
    path.mkdir(parents=True, exist_ok=True)
    return path
def wmae(y_true: np.ndarray, y_pred: np.ndarray, holiday: np.ndarray) -> float:
    """Competition metric: MAE with weight 5 on holiday weeks, 1 otherwise."""
    return float(np.average(np.abs(y_true - y_pred), weights=np.where(holiday, 5.0, 1.0)))
def summarize_metrics(df: pd.DataFrame) -> dict[str, float]:
    """Compute WMAE/MAE/RMSE (and anomaly MAE when available) for predictions."""
    actual = df["Weekly_Sales"].to_numpy(dtype=float)
    predicted = df["pred_sales"].to_numpy(dtype=float)
    holidays = df["IsHoliday"].to_numpy(dtype=bool)
    abs_err = np.abs(actual - predicted)
    summary = {
        "wmae": wmae(actual, predicted, holidays),
        "mae": float(abs_err.mean()),
        "rmse": float(np.sqrt(np.mean((actual - predicted) ** 2))),
        "n_rows": float(df.shape[0]),
    }
    if "sales_anom" in df.columns and "pred_anom" in df.columns:
        summary["anom_mae"] = float(np.mean(np.abs(df["sales_anom"] - df["pred_anom"])))
    return summary
def _get_lag_value(hist: deque[float], lag: int) -> float:
if len(hist) >= lag:
return float(hist[-lag])
return np.nan
def _robust_bounds(values: np.ndarray, clip_scale: float = 2.0) -> tuple[float, float]:
arr = np.asarray(values, dtype=float)
arr = arr[np.isfinite(arr)]
if arr.size == 0:
return (-1.0, 1.0)
q01, q99 = np.quantile(arr, [0.01, 0.99])
std = float(np.std(arr))
span = max(float(q99 - q01), std, 1.0)
lower = float(q01 - clip_scale * span)
upper = float(q99 + clip_scale * span)
if lower >= upper:
center = float(np.median(arr))
lower, upper = center - 1.0, center + 1.0
return (lower, upper)
def initialize_target_history(train_df: pd.DataFrame, max_lag: int) -> dict[str, deque[float]]:
    """Seed one bounded deque per series with its trailing observed anomalies."""
    out: dict[str, deque[float]] = {}
    by_time = train_df.sort_values(["series_id", "Date"])
    for sid, grp in by_time.groupby("series_id", sort=False):
        tail = grp[TARGET_COL].dropna().to_numpy(dtype=float)
        out[sid] = deque(tail.tolist(), maxlen=max_lag)
    return out
def make_supervised_lag_frame(
    train_df: pd.DataFrame,
    lag_orders: tuple[int, ...] = LAG_ORDERS,
    feature_cols: list[str] = ANOM_FEATURE_COLS,
) -> tuple[pd.DataFrame, np.ndarray]:
    """Build the (X, y) supervised table with per-series lag features.

    Rows with any non-finite feature or target (e.g. the first max(lag_orders)
    weeks of each series, whose lags are NaN) are dropped.
    """
    out = train_df.sort_values(["series_id", "Date"]).copy()
    grouped = out.groupby("series_id")[TARGET_COL]
    for lag in lag_orders:
        out[f"lag{lag}"] = grouped.shift(lag)
    X = out[feature_cols].copy()
    y = out[TARGET_COL].to_numpy(dtype=float)
    valid_mask = np.isfinite(y) & np.all(np.isfinite(X.to_numpy(dtype=float)), axis=1)
    return X.loc[valid_mask].reset_index(drop=True), y[valid_mask]
def recursive_predict_tabular_anomaly(
    future_df: pd.DataFrame,
    model,
    history: dict[str, deque[float]],
    lag_orders: tuple[int, ...] = LAG_ORDERS,
    feature_cols: list[str] = ANOM_FEATURE_COLS,
    lower: float = -np.inf,
    upper: float = np.inf,
) -> pd.DataFrame:
    """Recursive multi-step forecast for sklearn-style tabular regressors.

    Predicts one Date block at a time so lag features can be fed from the
    (clipped) predictions of earlier weeks; `model` only needs a
    `.predict(DataFrame) -> array` method. Returns the sorted frame with
    pred_sales/pred_anom merged on a generated row_id.
    """
    max_lag = max(lag_orders)
    ordered = future_df.sort_values(["Date", "series_id"]).copy()
    ordered["row_id"] = np.arange(ordered.shape[0], dtype=int)
    preds: list[tuple[int, float, float]] = []
    for _, block in ordered.groupby("Date", sort=True):
        rows = []
        row_ids = block["row_id"].to_numpy(dtype=int)
        series_ids = block["series_id"].tolist()
        clim_sales = block["clim_sales"].to_numpy(dtype=float)
        for row in block.itertuples(index=False):
            hist = history.get(row.series_id)
            if hist is None:
                # Series unseen in training: lags read as NaN until filled.
                hist = deque(maxlen=max_lag)
                history[row.series_id] = hist
            lag_features = {f"lag{lag}": _get_lag_value(hist, lag) for lag in lag_orders}
            rows.append(
                {
                    **lag_features,
                    "week_of_year": float(row.week_of_year),
                    "month": float(row.month),
                    "is_holiday_int": float(row.is_holiday_int),
                    "temp_anom": float(row.temp_anom),
                    "fuel_anom": float(row.fuel_anom),
                    "MarkDown1": float(row.MarkDown1),
                    "MarkDown2": float(row.MarkDown2),
                    "MarkDown3": float(row.MarkDown3),
                    "MarkDown4": float(row.MarkDown4),
                    "MarkDown5": float(row.MarkDown5),
                    "CPI": float(row.CPI),
                    "Unemployment": float(row.Unemployment),
                }
            )
        # One vectorized predict call per week keeps the loop fast.
        X_step = pd.DataFrame(rows, columns=feature_cols)
        pred_anom_step = np.asarray(model.predict(X_step), dtype=float)
        pred_anom_step = np.clip(pred_anom_step, lower, upper)
        pred_sales_step = pred_anom_step + clim_sales
        for row_id, series_id, pred_sales, pred_anom in zip(
            row_ids, series_ids, pred_sales_step, pred_anom_step
        ):
            preds.append((int(row_id), float(pred_sales), float(pred_anom)))
            # Feed the clipped prediction back for next week's lag features.
            history[series_id].append(float(pred_anom))
    pred_df = pd.DataFrame(preds, columns=["row_id", "pred_sales", "pred_anom"])
    return ordered.merge(pred_df, on="row_id", how="left")
def run_cv_tabular_baseline_anomaly(
    train_df: pd.DataFrame,
    model_builder: Callable[[], object],
    n_folds: int,
    val_weeks: int,
    lag_orders: tuple[int, ...] = LAG_ORDERS,
    feature_cols: list[str] = ANOM_FEATURE_COLS,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Forward-chaining CV for a tabular anomaly baseline.

    Builds fold-local climatology/lag features (no validation leakage), fits
    a fresh model per fold via `model_builder`, and forecasts the fold's
    validation weeks recursively. NOTE: relies on `compute_climatology` /
    `attach_climatology` from the Phase 1 definitions cell above — run that
    cell first. Returns (per-fold metrics plus a mean row, stacked OOF preds).
    """
    folds = build_forward_folds(train_df, n_folds=n_folds, val_weeks=val_weeks)
    metrics_rows: list[dict[str, float | int | str]] = []
    oof_parts: list[pd.DataFrame] = []
    max_lag = max(lag_orders)
    for fold_id, train_dates, val_dates in folds:
        print(f"\nFold {fold_id}: train={len(train_dates)} weeks, val={len(val_dates)} weeks")
        t0 = time.perf_counter()
        fold_train_raw = train_df[train_df["Date"].isin(train_dates)].copy()
        fold_val_raw = train_df[train_df["Date"].isin(val_dates)].copy()
        climatology = compute_climatology(fold_train_raw)
        fold_train = attach_climatology(fold_train_raw, climatology).sort_values(["series_id", "Date"]).copy()
        fold_val = attach_climatology(fold_val_raw, climatology).sort_values(["Date", "series_id"]).copy()
        X_train, y_train = make_supervised_lag_frame(fold_train, lag_orders=lag_orders, feature_cols=feature_cols)
        model = model_builder()
        model.fit(X_train, y_train)
        history = initialize_target_history(fold_train, max_lag=max_lag)
        # Robust clip bounds keep recursive forecasts from drifting.
        lower, upper = _robust_bounds(fold_train[TARGET_COL].to_numpy(dtype=float), clip_scale=2.0)
        fold_val_pred = recursive_predict_tabular_anomaly(
            fold_val,
            model,
            history,
            lag_orders=lag_orders,
            feature_cols=feature_cols,
            lower=lower,
            upper=upper,
        )
        fold_metrics = summarize_metrics(fold_val_pred)
        fold_metrics["runtime_sec"] = float(time.perf_counter() - t0)
        fold_metrics["fold"] = fold_id
        fold_metrics["train_start"] = str(pd.Timestamp(train_dates[0]).date())
        fold_metrics["train_end"] = str(pd.Timestamp(train_dates[-1]).date())
        fold_metrics["val_start"] = str(pd.Timestamp(val_dates[0]).date())
        fold_metrics["val_end"] = str(pd.Timestamp(val_dates[-1]).date())
        metrics_rows.append(fold_metrics)
        print(
            f" wmae={fold_metrics['wmae']:.4f} mae={fold_metrics['mae']:.4f} "
            f"rmse={fold_metrics['rmse']:.4f} runtime={fold_metrics['runtime_sec']:.1f}s"
        )
        keep_cols = [
            "Store",
            "Dept",
            "Date",
            "IsHoliday",
            "Weekly_Sales",
            "sales_anom",
            "pred_anom",
            "pred_sales",
            "series_id",
        ]
        part = fold_val_pred[keep_cols].copy()
        part["fold"] = fold_id
        oof_parts.append(part)
    metrics_df = pd.DataFrame(metrics_rows)
    if not metrics_df.empty:
        # Trailing aggregate row with the cross-fold means.
        mean_row = {"fold": "mean"}
        for col in ["wmae", "mae", "rmse", "anom_mae", "n_rows", "runtime_sec"]:
            if col in metrics_df.columns:
                mean_row[col] = float(metrics_df[col].mean())
        metrics_df = pd.concat([metrics_df, pd.DataFrame([mean_row])], ignore_index=True)
    oof_df = pd.concat(oof_parts, ignore_index=True) if oof_parts else pd.DataFrame()
    return metrics_df, oof_df
def fit_full_train_and_predict_test_tabular_anomaly(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    model_builder: Callable[[], object],
    lag_orders: tuple[int, ...] = LAG_ORDERS,
    feature_cols: list[str] = ANOM_FEATURE_COLS,
) -> pd.DataFrame:
    """Fit one tabular anomaly model on ALL training data and forecast the test span.

    Mirrors the per-fold CV path: attach train-derived climatology to both
    frames, fit on lagged supervised rows, then predict test rows recursively
    in date-major order while clipping anomalies to robust train bounds.
    Returns only identifying columns plus the predictions.

    NOTE(review): ``feature_cols`` defaults to a module-level list (mutable
    default) — fine as long as no caller mutates it; confirm if reused.
    """
    # Climatology is computed from the full training data so test rows are
    # anchored to the same seasonal baseline the model was trained against.
    climatology = compute_climatology(train_df)
    full_train = attach_climatology(train_df.copy(), climatology).sort_values(["series_id", "Date"]).copy()
    # Date-first ordering lets the recursive predictor advance week by week.
    full_test = attach_climatology(test_df.copy(), climatology).sort_values(["Date", "series_id"]).copy()
    X_train, y_train = make_supervised_lag_frame(full_train, lag_orders=lag_orders, feature_cols=feature_cols)
    model = model_builder()
    model.fit(X_train, y_train)
    # Seed per-series lag histories with the tail of each training series.
    history = initialize_target_history(full_train, max_lag=max(lag_orders))
    # Robust clip bounds keep recursive predictions from drifting to extremes.
    lower, upper = _robust_bounds(full_train[TARGET_COL].to_numpy(dtype=float), clip_scale=2.0)
    test_pred = recursive_predict_tabular_anomaly(
        full_test,
        model,
        history,
        lag_orders=lag_orders,
        feature_cols=feature_cols,
        lower=lower,
        upper=upper,
    )
    return test_pred[
        ["Store", "Dept", "Date", "IsHoliday", "pred_sales", "pred_anom", "series_id"]
    ].copy()

2.6 Phase 2 Definitions
from __future__ import annotations
import argparse
import json
import time
from collections import deque
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
import pymc as pm
# Target column: store-dept weekly sales anomaly (sales minus climatology).
TARGET_COL = "sales_anom"
# Annual cycle length (in weeks) driving the Fourier seasonal features.
PERIOD_WEEKS = 52.0
# Autoregressive lag orders (in weeks) used by the structural model.
LAG_ORDERS = (1, 2)
LAG_COLS = [f"sales_anom_lag{lag}" for lag in LAG_ORDERS]
# Exogenous regressors shared by every series in the Phase 2 model.
EXOG_COLS = [
    "temp_anom",
    "fuel_anom",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
]
@dataclass(slots=True)
class ScaleStats:
    """Mean/std pair used to z-score one feature column."""

    # Training-sample mean of the column.
    mean: float
    # Training-sample standard deviation (fit_standardizer clamps degenerate values to 1.0).
    std: float
@dataclass(slots=True)
class StructuralARConfig:
    """Run-time knobs for the Phase 2 structural AR model."""

    # Number of forward-chaining CV folds.
    n_folds: int
    # Validation window length per fold, in weeks.
    val_weeks: int
    # Max evaluations for the pm.find_MAP optimisation.
    max_eval: int
    # Seed for MAP optimisation and predictive sampling.
    random_seed: int
    # Monte-Carlo draws used for recursive predictive uncertainty.
    pred_draws: int
    # Number of Fourier harmonics for the annual seasonal component.
    fourier_order: int
    # Series-volatility clusters for heteroskedastic sigma (<=1 disables).
    sigma_clusters: int
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the Phase 2 structural AR run."""
    parser = argparse.ArgumentParser(
        description="Phase 2: Structural AR + trend + seasonal Fourier model with recursive validation."
    )
    # Table-driven registration: (flag, add_argument kwargs), in display order.
    options: list[tuple[str, dict]] = [
        ("--train-path", {"default": "train_feat.parquet"}),
        ("--test-path", {"default": "test_feat.parquet"}),
        ("--output-dir", {"default": "outputs/phase2_structural_ar"}),
        ("--n-folds", {"type": int, "default": 4}),
        ("--val-weeks", {"type": int, "default": 13}),
        ("--max-eval", {"type": int, "default": 3000}),
        ("--random-seed", {"type": int, "default": 8927}),
        ("--pred-draws", {"type": int, "default": 80}),
        ("--fourier-order", {"type": int, "default": 3}),
        (
            "--sigma-clusters",
            {
                "type": int,
                "default": 0,
                "help": "Number of series-level noise clusters for heteroskedastic sigma (0 disables).",
            },
        ),
        (
            "--max-series",
            {
                "type": int,
                "default": 200,
                "help": "Subset of Store-Dept series for tractable PyMC structural model.",
            },
        ),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
def ensure_dirs(base: Path) -> dict[str, Path]:
    """Create (if needed) and return the run's output directory layout."""
    layout = {
        "root": base,
        "metrics": base / "metrics",
        "preds": base / "preds",
    }
    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)
    return layout
def fit_standardizer(series: pd.Series) -> ScaleStats:
    """Fit mean/std scaling stats for *series*, robust to degenerate input.

    The std is clamped to 1.0 when non-finite (empty or single-row input,
    where pandas' ddof=1 std is NaN) or numerically zero, so z-scoring never
    divides by ~0. The mean is likewise forced to 0.0 when non-finite
    (empty/all-NaN input) — previously a NaN mean would silently poison every
    downstream z-score.
    """
    mean = float(series.mean())
    std = float(series.std())
    # Guard the mean too: an empty or all-NaN series yields NaN here.
    if not np.isfinite(mean):
        mean = 0.0
    if not np.isfinite(std) or std < 1e-8:
        std = 1.0
    return ScaleStats(mean=mean, std=std)
def zscore(values: np.ndarray, stats: ScaleStats) -> np.ndarray:
    """Standardize *values* with the given ScaleStats (vectorized)."""
    centered = values - stats.mean
    return centered / stats.std
def add_structural_features(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    fourier_order: int,
) -> tuple[pd.DataFrame, pd.DataFrame, list[str]]:
    """Attach trend and annual Fourier columns to both frames (in place).

    The week index ``t_week`` is counted from the earliest *training* date so
    validation rows continue the same clock. Returns both frames plus the
    list of generated Fourier column names.
    """
    origin = train_df["Date"].min()
    for frame in (train_df, val_df):
        frame["t_week"] = ((frame["Date"] - origin).dt.days // 7).astype(float)
        frame["trend"] = frame["t_week"]
    fourier_cols: list[str] = []
    for harmonic in range(1, fourier_order + 1):
        sin_name = f"ff_sin_{harmonic}"
        cos_name = f"ff_cos_{harmonic}"
        for frame in (train_df, val_df):
            phase = 2.0 * np.pi * harmonic * frame["t_week"] / PERIOD_WEEKS
            frame[sin_name] = np.sin(phase)
            frame[cos_name] = np.cos(phase)
        fourier_cols.append(sin_name)
        fourier_cols.append(cos_name)
    return train_df, val_df, fourier_cols
def prepare_fold_data(
    train_df: pd.DataFrame,
    train_dates: np.ndarray,
    val_dates: np.ndarray,
    fourier_order: int,
) -> tuple[pd.DataFrame, pd.DataFrame, list[str]]:
    """Build leak-free train/val frames for one fold, with lags and structural features.

    Climatology is computed from the fold's training weeks only, then applied
    to both splits; lagging is done on the combined frame so the first
    validation rows can see training-era lag values.
    """
    fold_train_raw = train_df[train_df["Date"].isin(train_dates)].copy()
    fold_val_raw = train_df[train_df["Date"].isin(val_dates)].copy()
    fold_train_raw["split"] = "train"
    fold_val_raw["split"] = "val"
    climatology = compute_climatology(fold_train_raw)
    # Concatenate so add_anomaly_lags can span the train/val boundary.
    combined = pd.concat([fold_train_raw, fold_val_raw], ignore_index=True)
    combined = attach_climatology(combined, climatology)
    combined = add_anomaly_lags(combined)
    combined = combined.sort_values(["Date", "Store", "Dept"]).reset_index(drop=True)
    fold_train = combined[combined["split"] == "train"].copy()
    fold_val = combined[combined["split"] == "val"].copy()
    # Keep early rows by treating unavailable long-lag anomalies as neutral (0).
    for c in LAG_COLS:
        fold_train[c] = fold_train[c].fillna(0.0)
        fold_val[c] = fold_val[c].fillna(0.0)
    # Impute exogenous gaps with the TRAIN median (falling back to 0) so the
    # validation split never contributes to imputation statistics.
    for c in EXOG_COLS:
        median = float(fold_train[c].median())
        if not np.isfinite(median):
            median = 0.0
        fold_train[c] = fold_train[c].fillna(median)
        fold_val[c] = fold_val[c].fillna(median)
    fold_train = fold_train.dropna(subset=[TARGET_COL]).reset_index(drop=True)
    fold_val = fold_val.dropna(subset=[TARGET_COL]).reset_index(drop=True)
    fold_train, fold_val, fourier_cols = add_structural_features(
        fold_train, fold_val, fourier_order=fourier_order
    )
    return fold_train, fold_val, fourier_cols
def _lag_from_history(hist: deque[float], lag: int) -> float:
if len(hist) >= lag:
return float(hist[-lag])
return 0.0
def initialize_history(train_part: pd.DataFrame) -> dict[str, deque[float]]:
    """Seed per-series deques with each series' trailing target anomalies.

    Every deque is capped at the maximum lag order, so it holds exactly the
    values the recursive predictor needs to form its first lag features.
    """
    depth = max(LAG_ORDERS)
    by_time = train_part.sort_values(["series_id", "Date"])
    history: dict[str, deque[float]] = {}
    for sid, chunk in by_time.groupby("series_id", sort=False):
        tail = chunk[TARGET_COL].dropna().to_numpy(dtype=float)
        history[sid] = deque(tail.tolist(), maxlen=depth)
    return history
def build_sigma_clusters(train_part: pd.DataFrame, n_clusters: int) -> dict[str, int]:
    """Map each series_id to a volatility cluster index via quantile binning.

    Returns an empty mapping when clustering is disabled (n_clusters <= 1).
    """
    if n_clusters <= 1:
        return {}
    per_series = (
        train_part.groupby("series_id", as_index=False)[TARGET_COL]
        .std()
        .rename(columns={TARGET_COL: "anom_std"})
    )
    per_series["anom_std"] = per_series["anom_std"].fillna(0.0)
    n_bins = int(max(1, min(n_clusters, per_series.shape[0])))
    if n_bins == 1:
        # Too few series to split: everyone shares cluster 0.
        return {sid: 0 for sid in per_series["series_id"].tolist()}
    # Quantile bins by anomaly volatility to capture heteroskedastic groups.
    bin_labels = pd.qcut(per_series["anom_std"], q=n_bins, labels=False, duplicates="drop")
    bin_labels = bin_labels.fillna(0).astype(int)
    return dict(zip(per_series["series_id"], bin_labels))
def fit_structural_model(
    train_part: pd.DataFrame,
    fourier_cols: list[str],
    cfg: StructuralARConfig,
) -> dict:
    """MAP-fit the hierarchical structural AR model on one fold's training rows.

    Regresses the z-scored sales anomaly on z-scored lags and exogenous
    features, a holiday flag, a standardized linear trend, and Fourier
    seasonal terms, with a partially pooled per-series intercept. Returns a
    dict bundling the MAP point plus everything needed for recursive
    prediction (scalers, series index, lag histories, clip bounds).
    """
    # All standardizers are fit on training rows only and reused at predict time.
    y_stats = fit_standardizer(train_part[TARGET_COL])
    lag_stats = [fit_standardizer(train_part[c]) for c in LAG_COLS]
    exog_stats = [fit_standardizer(train_part[c]) for c in EXOG_COLS]
    trend_stats = fit_standardizer(train_part["trend"])
    y_train = zscore(train_part[TARGET_COL].to_numpy(dtype=float), y_stats)
    lag_z_matrix = np.column_stack(
        [
            zscore(train_part[col].to_numpy(dtype=float), stats)
            for col, stats in zip(LAG_COLS, lag_stats)
        ]
    )
    exog_z_matrix = np.column_stack(
        [
            zscore(train_part[col].to_numpy(dtype=float), stats)
            for col, stats in zip(EXOG_COLS, exog_stats)
        ]
    )
    trend_z = zscore(train_part["trend"].to_numpy(dtype=float), trend_stats)
    holiday = train_part["is_holiday_int"].to_numpy(dtype=float)
    ff_matrix = train_part[fourier_cols].to_numpy(dtype=float)
    # Stable (sorted) series ordering -> integer index for per-series intercepts.
    series_ids = train_part["series_id"].drop_duplicates().sort_values().tolist()
    series_to_idx = {sid: i for i, sid in enumerate(series_ids)}
    train_series_idx = train_part["series_id"].map(series_to_idx).to_numpy(dtype=int)
    n_series = len(series_ids)
    series_to_cluster: dict[str, int] = {}
    cluster_idx: np.ndarray | None = None
    n_cluster = 0
    if cfg.sigma_clusters > 1:
        series_to_cluster = build_sigma_clusters(train_part, cfg.sigma_clusters)
        if series_to_cluster:
            cluster_idx = train_part["series_id"].map(series_to_cluster).fillna(0).to_numpy(dtype=int)
            n_cluster = int(np.max(cluster_idx)) + 1
    with pm.Model() as model:
        # Non-centered parameterization of the partially pooled intercepts.
        alpha_mu = pm.Normal("alpha_mu", mu=0.0, sigma=1.0)
        alpha_sigma = pm.HalfNormal("alpha_sigma", sigma=1.0)
        z_alpha = pm.Normal("z_alpha", mu=0.0, sigma=1.0, shape=n_series)
        alpha_series = pm.Deterministic("alpha_series", alpha_mu + alpha_sigma * z_alpha)
        beta_lags = pm.Normal("beta_lags", mu=0.0, sigma=0.6, shape=len(LAG_COLS))
        beta_exog = pm.Normal("beta_exog", mu=0.0, sigma=0.5, shape=len(EXOG_COLS))
        beta_holiday = pm.Normal("beta_holiday", mu=0.0, sigma=0.8)
        beta_trend = pm.Normal("beta_trend", mu=0.0, sigma=0.5)
        beta_fourier = pm.Normal("beta_fourier", mu=0.0, sigma=0.5, shape=ff_matrix.shape[1])
        # Optional heteroskedastic noise: one sigma per volatility cluster.
        if cluster_idx is not None and n_cluster > 1:
            sigma_cluster = pm.HalfNormal("sigma_cluster", sigma=0.8, shape=n_cluster)
            sigma_obs = sigma_cluster[cluster_idx]
        else:
            sigma = pm.HalfNormal("sigma", sigma=0.8)
            sigma_obs = sigma
        mu = (
            alpha_series[train_series_idx]
            + pm.math.dot(lag_z_matrix, beta_lags)
            + pm.math.dot(exog_z_matrix, beta_exog)
            + beta_holiday * holiday
            + beta_trend * trend_z
            + pm.math.dot(ff_matrix, beta_fourier)
        )
        pm.Normal("y_obs", mu=mu, sigma=sigma_obs, observed=y_train)
        # MAP point estimate (no posterior sampling) keeps the fit tractable.
        map_point = pm.find_MAP(
            progressbar=False,
            maxeval=cfg.max_eval,
            return_raw=False,
            seed=cfg.random_seed,
        )
    # Wide robust bounds from train quantiles cap recursive predictions later.
    train_vals = train_part[TARGET_COL].to_numpy(dtype=float)
    q01, q99 = np.quantile(train_vals, [0.01, 0.99])
    span = max(float(q99 - q01), float(np.std(train_vals)), 1.0)
    lower = float(q01 - 2.0 * span)
    upper = float(q99 + 2.0 * span)
    fitted = {
        "map_point": map_point,
        "series_to_idx": series_to_idx,
        "y_stats": y_stats,
        "lag_stats": lag_stats,
        "exog_stats": exog_stats,
        "exog_cols": EXOG_COLS,
        "trend_stats": trend_stats,
        "fourier_cols": fourier_cols,
        "history_base": initialize_history(train_part),
        "clip_bounds": (lower, upper),
        "series_to_cluster": series_to_cluster,
    }
    return fitted
def recursive_predict_with_uncertainty(
    val_part: pd.DataFrame,
    fitted: dict,
    cfg: StructuralARConfig,
) -> tuple[np.ndarray, np.ndarray]:
    """Recursively forecast validation anomalies with Monte-Carlo uncertainty.

    Runs ``cfg.pred_draws`` independent passes. Each pass resets per-series
    lag histories to the training tails, walks rows in (Date, series_id)
    order, injects Normal noise scaled by the MAP sigma, clips to the
    train-derived bounds, and feeds the prediction back into the history.
    Returns (mean, sd) across draws, aligned to *val_part*'s row order.
    """
    # Remember the caller's row order so results can be mapped back at the end.
    ordered = (
        val_part.reset_index()
        .rename(columns={"index": "orig_idx"})
        .sort_values(["Date", "series_id"])
        .reset_index(drop=True)
    )
    mp = fitted["map_point"]
    series_to_idx = fitted["series_to_idx"]
    alpha_series = np.asarray(mp["alpha_series"], dtype=float)
    alpha_mu = float(mp["alpha_mu"])
    beta_lags = np.asarray(mp["beta_lags"], dtype=float)
    beta_exog = np.asarray(mp["beta_exog"], dtype=float)
    beta_holiday = float(mp["beta_holiday"])
    beta_trend = float(mp["beta_trend"])
    beta_fourier = np.asarray(mp["beta_fourier"], dtype=float)
    # Exactly one of these is present, depending on whether sigma clustering was on.
    sigma_global = float(mp["sigma"]) if "sigma" in mp else None
    sigma_cluster = np.asarray(mp["sigma_cluster"], dtype=float) if "sigma_cluster" in mp else None
    y_stats = fitted["y_stats"]
    lag_stats = fitted["lag_stats"]
    exog_stats = fitted["exog_stats"]
    exog_cols = fitted["exog_cols"]
    trend_stats = fitted["trend_stats"]
    fourier_cols = fitted["fourier_cols"]
    series_to_cluster = fitted.get("series_to_cluster", {})
    lower, upper = fitted["clip_bounds"]
    sid_arr = ordered["series_id"].to_numpy()
    exog_arr = ordered[exog_cols].to_numpy(dtype=float)
    holiday_arr = ordered["is_holiday_int"].to_numpy(dtype=float)
    trend_arr = ordered["trend"].to_numpy(dtype=float)
    ff_arr = ordered[fourier_cols].to_numpy(dtype=float)
    orig_idx = ordered["orig_idx"].to_numpy(dtype=int)
    n = len(ordered)
    draws = max(1, cfg.pred_draws)
    draw_preds = np.empty((draws, n), dtype=float)
    rng = np.random.default_rng(cfg.random_seed)
    for d in range(draws):
        # Fresh copy of the training-tail histories for every draw.
        history = {sid: deque(vals, maxlen=max(LAG_ORDERS)) for sid, vals in fitted["history_base"].items()}
        for i in range(n):
            sid = sid_arr[i]
            hist = history.get(sid)
            if hist is None:
                hist = deque(maxlen=max(LAG_ORDERS))
                history[sid] = hist
            sid_idx = series_to_idx.get(sid)
            # Series unseen in training fall back to the pooled intercept.
            alpha = alpha_series[sid_idx] if sid_idx is not None else alpha_mu
            lag_z_vals = [
                (_lag_from_history(hist, lag) - stats.mean) / stats.std
                for lag, stats in zip(LAG_ORDERS, lag_stats)
            ]
            exog_z_vals = [
                (exog_arr[i, j] - stats.mean) / stats.std
                for j, stats in enumerate(exog_stats)
            ]
            trend_z = (trend_arr[i] - trend_stats.mean) / trend_stats.std
            pred_z = (
                alpha
                + float(np.dot(lag_z_vals, beta_lags))
                + float(np.dot(exog_z_vals, beta_exog))
                + beta_holiday * holiday_arr[i]
                + beta_trend * trend_z
                + float(np.dot(ff_arr[i], beta_fourier))
            )
            if sigma_cluster is not None and series_to_cluster:
                c_idx = int(series_to_cluster.get(sid, 0))
                sigma_draw = float(sigma_cluster[min(c_idx, len(sigma_cluster) - 1)])
            else:
                sigma_draw = float(sigma_global if sigma_global is not None else 1.0)
            # Inject observation noise so the draw spread reflects sigma.
            pred_z += float(rng.normal(loc=0.0, scale=sigma_draw))
            # Back to raw anomaly units, then clip to the robust train bounds.
            pred_raw = float(pred_z * y_stats.std + y_stats.mean)
            pred_raw = float(np.clip(pred_raw, lower, upper))
            draw_preds[d, i] = pred_raw
            # Feed the (noisy) prediction back as the next step's lag.
            hist.append(pred_raw)
    mean_ordered = draw_preds.mean(axis=0)
    sd_ordered = draw_preds.std(axis=0, ddof=0)
    # Map draw statistics back to the caller's original row order.
    mean_series = pd.Series(mean_ordered, index=orig_idx)
    sd_series = pd.Series(sd_ordered, index=orig_idx)
    pred_mean = mean_series.reindex(val_part.index).to_numpy(dtype=float)
    pred_sd = sd_series.reindex(val_part.index).to_numpy(dtype=float)
    return pred_mean, pred_sd
def evaluate_fold(val_df: pd.DataFrame) -> dict[str, float]:
    """Summarize sales-level and anomaly-level errors for one fold."""
    actual = val_df["Weekly_Sales"].to_numpy(dtype=float)
    predicted = val_df["pred_sales"].to_numpy(dtype=float)
    holiday_mask = val_df["IsHoliday"].to_numpy(dtype=bool)
    sales_err = actual - predicted
    anom_err = val_df["sales_anom"] - val_df["pred_anom"]
    return {
        "wmae": wmae(actual, predicted, holiday_mask),
        "mae": float(np.mean(np.abs(sales_err))),
        "rmse": float(np.sqrt(np.mean(sales_err ** 2))),
        "anom_mae": float(np.mean(np.abs(anom_err))),
        "n_rows": float(len(val_df)),
    }
def run_cv(train_df: pd.DataFrame, cfg: StructuralARConfig) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Forward-chaining CV driver for the Phase 2 structural AR model.

    For each fold: prepare leak-free data, MAP-fit, predict recursively with
    uncertainty, and collect sales-level metrics plus out-of-fold predictions
    (including 95% normal-approximation intervals). Appends a 'mean' summary
    row to the metrics frame.
    """
    folds = build_forward_folds(train_df, n_folds=cfg.n_folds, val_weeks=cfg.val_weeks)
    metrics_rows: list[dict] = []
    oof_parts: list[pd.DataFrame] = []
    for fold_id, train_dates, val_dates in folds:
        print(f"\nFold {fold_id}: train={len(train_dates)} weeks, val={len(val_dates)} weeks")
        fold_train, fold_val, fourier_cols = prepare_fold_data(
            train_df, train_dates, val_dates, fourier_order=cfg.fourier_order
        )
        print(f" rows used -> train={len(fold_train)}, val={len(fold_val)}")
        t0 = time.perf_counter()
        fitted = fit_structural_model(fold_train, fourier_cols=fourier_cols, cfg=cfg)
        pred_anom, pred_anom_sd = recursive_predict_with_uncertainty(fold_val, fitted=fitted, cfg=cfg)
        elapsed = time.perf_counter() - t0
        fold_val = fold_val.copy()
        fold_val["pred_anom"] = pred_anom
        fold_val["pred_anom_sd"] = pred_anom_sd
        # Reconstruct sales-level predictions by adding back the climatology.
        fold_val["pred_sales"] = fold_val["pred_anom"] + fold_val["clim_sales"]
        fold_val["pred_sales_sd"] = fold_val["pred_anom_sd"]
        # 95% interval under a normal approximation of the draw spread.
        fold_val["pred_sales_lower"] = fold_val["pred_sales"] - 1.96 * fold_val["pred_sales_sd"]
        fold_val["pred_sales_upper"] = fold_val["pred_sales"] + 1.96 * fold_val["pred_sales_sd"]
        fold_val["fold"] = fold_id
        fold_metrics = evaluate_fold(fold_val)
        fold_metrics["fold"] = fold_id
        fold_metrics["train_start"] = str(pd.Timestamp(train_dates[0]).date())
        fold_metrics["train_end"] = str(pd.Timestamp(train_dates[-1]).date())
        fold_metrics["val_start"] = str(pd.Timestamp(val_dates[0]).date())
        fold_metrics["val_end"] = str(pd.Timestamp(val_dates[-1]).date())
        fold_metrics["runtime_sec"] = float(elapsed)
        metrics_rows.append(fold_metrics)
        print(
            f" wmae={fold_metrics['wmae']:.4f} mae={fold_metrics['mae']:.4f} "
            f"rmse={fold_metrics['rmse']:.4f} runtime={elapsed:.1f}s"
        )
        keep_cols = [
            "Store",
            "Dept",
            "Date",
            "IsHoliday",
            "Weekly_Sales",
            "sales_anom",
            "pred_anom",
            "pred_anom_sd",
            "pred_sales",
            "pred_sales_sd",
            "pred_sales_lower",
            "pred_sales_upper",
            "fold",
        ]
        oof_parts.append(fold_val[keep_cols].copy())
    metrics_df = pd.DataFrame(metrics_rows)
    if not metrics_df.empty:
        # Append an aggregate row so reports can read the mean directly.
        mean_row = {"fold": "mean"}
        for col in ["wmae", "mae", "rmse", "anom_mae", "n_rows", "runtime_sec"]:
            mean_row[col] = float(metrics_df[col].mean())
        metrics_df = pd.concat([metrics_df, pd.DataFrame([mean_row])], ignore_index=True)
    oof_df = pd.concat(oof_parts, ignore_index=True) if oof_parts else pd.DataFrame()
    return metrics_df, oof_df
def main() -> None:
    """CLI entry point for Phase 2: run CV, then persist metrics, OOF preds, and config."""
    args = parse_args()
    out = ensure_dirs(Path(args.output_dir))
    cfg = StructuralARConfig(
        n_folds=args.n_folds,
        val_weeks=args.val_weeks,
        max_eval=args.max_eval,
        random_seed=args.random_seed,
        pred_draws=args.pred_draws,
        fourier_order=args.fourier_order,
        sigma_clusters=args.sigma_clusters,
    )
    print("Loading data...")
    # The test split is unused here; Phase 2 evaluates via forward-chaining CV only.
    train_df, _ = load_data(
        Path(args.train_path),
        Path(args.test_path),
        max_series=args.max_series,
    )
    print(
        f"rows={len(train_df)}, series={train_df['series_id'].nunique()}, "
        f"max_series={args.max_series}"
    )
    metrics_df, oof_df = run_cv(train_df, cfg=cfg)
    metrics_path = out["metrics"] / "structural_ar_cv.csv"
    oof_path = out["preds"] / "structural_ar_oof.parquet"
    metrics_df.to_csv(metrics_path, index=False)
    oof_df.to_parquet(oof_path, index=False)
    # Persist the exact run configuration next to the artifacts for provenance.
    cfg_payload = {
        "n_folds": args.n_folds,
        "val_weeks": args.val_weeks,
        "max_eval": args.max_eval,
        "random_seed": args.random_seed,
        "pred_draws": args.pred_draws,
        "fourier_order": args.fourier_order,
        "lag_orders": list(LAG_ORDERS),
        "exogenous_features": EXOG_COLS,
        "sigma_clusters": args.sigma_clusters,
        "max_series": args.max_series,
        "note": "Structural AR model with hierarchical series intercept, trend, Fourier seasonality, and recursive validation.",
    }
    cfg_path = out["metrics"] / "structural_ar_config.json"
    cfg_path.write_text(json.dumps(cfg_payload, indent=2), encoding="utf-8")
    print(f"Saved metrics -> {metrics_path}")
    print(f"Saved OOF preds -> {oof_path}")
    print(f"Saved config -> {cfg_path}")

# Alias consumed by the notebook runner cell below.
phase2_main = main

2.7 Phase 2 Run
# Phase 2 full-data run: all 3,331 series with a reduced MAP budget and fewer
# predictive draws to keep the full-data fit tractable.
run_with_argv(
    phase2_main,
    [
        "phase2_structural_ar",
        "--output-dir", "outputs/phase2_structural_ar_full",
        "--max-series", "3331",
        "--max-eval", "1800",
        "--pred-draws", "40",
        "--n-folds", "4",
        "--val-weeks", "13",
        "--fourier-order", "3",
        "--random-seed", "8927",
        "--sigma-clusters", "0",
    ],
)

2.8 Phase 3 Definitions
from __future__ import annotations
import argparse
import json
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the Phase 3 Elastic Net baseline."""
    parser = argparse.ArgumentParser(
        description="Phase 3: Elastic Net anomaly baseline with recursive forward-chaining evaluation."
    )
    # Table-driven registration: (flag, add_argument kwargs), in display order.
    options: list[tuple[str, dict]] = [
        ("--train-path", {"default": "train_feat.parquet"}),
        ("--test-path", {"default": "test_feat.parquet"}),
        ("--output-dir", {"default": "outputs/baselines/elastic_net"}),
        ("--n-folds", {"type": int, "default": 4}),
        ("--val-weeks", {"type": int, "default": 13}),
        ("--max-series", {"type": int, "default": None}),
        ("--alpha", {"type": float, "default": 0.02}),
        ("--l1-ratio", {"type": float, "default": 0.2}),
        ("--max-iter", {"type": int, "default": 10000}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
def make_model(alpha: float, l1_ratio: float, max_iter: int):
    """Median-impute -> standardize -> ElasticNet pipeline with a fixed seed."""
    regressor = ElasticNet(
        alpha=alpha,
        l1_ratio=l1_ratio,
        max_iter=max_iter,
        random_state=42,
    )
    stages = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", regressor),
    ]
    return Pipeline(steps=stages)
def main() -> None:
    """CLI entry point for Phase 3: run CV, forecast the test span, and dump artifacts."""
    args = parse_args()
    output_dir = ensure_output_dir(Path(args.output_dir))
    print("Loading data...")
    train_df, test_df = load_data(Path(args.train_path), Path(args.test_path), max_series=args.max_series)
    print(
        f"train rows={train_df.shape[0]}, test rows={test_df.shape[0]}, "
        f"train series={train_df['series_id'].nunique()}, test series={test_df['series_id'].nunique()}"
    )
    print("\nRunning recursive forward-chaining CV...")
    metrics_df, oof_df = run_cv_tabular_baseline_anomaly(
        train_df,
        model_builder=lambda: make_model(args.alpha, args.l1_ratio, args.max_iter),
        n_folds=args.n_folds,
        val_weeks=args.val_weeks,
        lag_orders=LAG_ORDERS,
        feature_cols=ANOM_FEATURE_COLS,
    )
    metrics_path = output_dir / "metrics.csv"
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Saved metrics -> {metrics_path}")
    oof_path = output_dir / "oof.parquet"
    oof_df.to_parquet(oof_path, index=False)
    print(f"Saved OOF predictions -> {oof_path}")
    print("\nFitting full train and forecasting test recursively...")
    # A fresh model instance is built for the full-train fit (same hyperparameters).
    test_pred = fit_full_train_and_predict_test_tabular_anomaly(
        train_df,
        test_df,
        model_builder=lambda: make_model(args.alpha, args.l1_ratio, args.max_iter),
        lag_orders=LAG_ORDERS,
        feature_cols=ANOM_FEATURE_COLS,
    )
    test_path = output_dir / "test.parquet"
    test_pred.to_parquet(test_path, index=False)
    print(f"Saved test predictions -> {test_path}")
    # Persist the exact run configuration next to the artifacts for provenance.
    config = {
        "model": "elastic_net",
        "target": "sales_anom",
        "features": ANOM_FEATURE_COLS,
        "lags": list(LAG_ORDERS),
        "alpha": args.alpha,
        "l1_ratio": args.l1_ratio,
        "max_iter": args.max_iter,
        "n_folds": args.n_folds,
        "val_weeks": args.val_weeks,
        "max_series": args.max_series,
        "train_path": args.train_path,
        "test_path": args.test_path,
    }
    config_path = output_dir / "config.json"
    config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
    print(f"Saved config -> {config_path}")

# Alias consumed by the notebook runner cell below.
phase3_main = main

2.9 Phase 3 Run
# Phase 3 run with default hyperparameters (alpha=0.02, l1_ratio=0.2).
run_with_argv(phase3_main, ["phase3_elastic_net"])

2.10 Phase 4 Definitions
from __future__ import annotations
import argparse
import json
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the Phase 4 Random Forest baseline."""
    parser = argparse.ArgumentParser(
        description="Phase 4: Random Forest anomaly baseline with recursive forward-chaining evaluation."
    )
    # Table-driven registration: (flag, add_argument kwargs), in display order.
    options: list[tuple[str, dict]] = [
        ("--train-path", {"default": "train_feat.parquet"}),
        ("--test-path", {"default": "test_feat.parquet"}),
        ("--output-dir", {"default": "outputs/baselines/random_forest"}),
        ("--n-folds", {"type": int, "default": 4}),
        ("--val-weeks", {"type": int, "default": 13}),
        ("--max-series", {"type": int, "default": None}),
        ("--n-estimators", {"type": int, "default": 120}),
        ("--max-depth", {"type": int, "default": 18}),
        ("--min-samples-leaf", {"type": int, "default": 2}),
        ("--max-features", {"default": "sqrt"}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
def make_model(n_estimators: int, max_depth: int, min_samples_leaf: int, max_features: str):
    """Median-impute -> RandomForestRegressor pipeline with a fixed seed."""
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )
    stages = [
        ("imputer", SimpleImputer(strategy="median")),
        ("model", forest),
    ]
    return Pipeline(steps=stages)
def main() -> None:
    """CLI entry point for Phase 4: run CV, forecast the test span, and dump artifacts."""
    args = parse_args()
    output_dir = ensure_output_dir(Path(args.output_dir))
    print("Loading data...")
    train_df, test_df = load_data(Path(args.train_path), Path(args.test_path), max_series=args.max_series)
    print(
        f"train rows={train_df.shape[0]}, test rows={test_df.shape[0]}, "
        f"train series={train_df['series_id'].nunique()}, test series={test_df['series_id'].nunique()}"
    )
    print("\nRunning recursive forward-chaining CV...")
    metrics_df, oof_df = run_cv_tabular_baseline_anomaly(
        train_df,
        model_builder=lambda: make_model(
            args.n_estimators,
            args.max_depth,
            args.min_samples_leaf,
            args.max_features,
        ),
        n_folds=args.n_folds,
        val_weeks=args.val_weeks,
        lag_orders=LAG_ORDERS,
        feature_cols=ANOM_FEATURE_COLS,
    )
    metrics_path = output_dir / "metrics.csv"
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Saved metrics -> {metrics_path}")
    oof_path = output_dir / "oof.parquet"
    oof_df.to_parquet(oof_path, index=False)
    print(f"Saved OOF predictions -> {oof_path}")
    print("\nFitting full train and forecasting test recursively...")
    # A fresh model instance is built for the full-train fit (same hyperparameters).
    test_pred = fit_full_train_and_predict_test_tabular_anomaly(
        train_df,
        test_df,
        model_builder=lambda: make_model(
            args.n_estimators,
            args.max_depth,
            args.min_samples_leaf,
            args.max_features,
        ),
        lag_orders=LAG_ORDERS,
        feature_cols=ANOM_FEATURE_COLS,
    )
    test_path = output_dir / "test.parquet"
    test_pred.to_parquet(test_path, index=False)
    print(f"Saved test predictions -> {test_path}")
    # Persist the exact run configuration next to the artifacts for provenance.
    config = {
        "model": "random_forest",
        "target": "sales_anom",
        "features": ANOM_FEATURE_COLS,
        "lags": list(LAG_ORDERS),
        "n_estimators": args.n_estimators,
        "max_depth": args.max_depth,
        "min_samples_leaf": args.min_samples_leaf,
        "max_features": args.max_features,
        "n_folds": args.n_folds,
        "val_weeks": args.val_weeks,
        "max_series": args.max_series,
        "train_path": args.train_path,
        "test_path": args.test_path,
    }
    config_path = output_dir / "config.json"
    config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
    print(f"Saved config -> {config_path}")

# Alias consumed by the notebook runner cell below.
phase4_main = main

2.11 Phase 4 Run
# Phase 4 run with default hyperparameters (120 trees, depth 18).
run_with_argv(phase4_main, ["phase4_random_forest"])

2.12 Phase 5 Definitions
from __future__ import annotations
import argparse
import json
import time
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tsa.holtwinters import ExponentialSmoothing
# Exogenous regressors used by the Ridge stage of the Phase 5 two-stage model.
EXOG_FEATURE_COLS = [
    "temp_anom",
    "fuel_anom",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
]
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the Phase 5 ETS baseline."""
    parser = argparse.ArgumentParser(
        description="Phase 5: ETS anomaly baseline with forward-chaining validation."
    )
    # Table-driven registration: (flag, add_argument kwargs), in display order.
    options: list[tuple[str, dict]] = [
        ("--train-path", {"default": "train_feat.parquet"}),
        ("--test-path", {"default": "test_feat.parquet"}),
        ("--output-dir", {"default": "outputs/baselines/ets"}),
        ("--n-folds", {"type": int, "default": 4}),
        ("--val-weeks", {"type": int, "default": 13}),
        ("--max-series", {"type": int, "default": None}),
        ("--seasonal-periods", {"type": int, "default": 52}),
        ("--exog-alpha", {"type": float, "default": 1.0}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
def make_exog_model(alpha: float) -> Pipeline:
    """Median-impute -> standardize -> Ridge pipeline for the exogenous stage."""
    stages = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=alpha)),
    ]
    return Pipeline(steps=stages)
def _safe_score(aic: float | None, sse: float | None) -> float:
if aic is not None and np.isfinite(aic):
return float(aic)
if sse is not None and np.isfinite(sse):
return float(sse)
return float("inf")
def forecast_ets_with_fallback(
    train_values: np.ndarray,
    horizon: int,
    seasonal_periods: int,
    fallback_value: float,
) -> tuple[np.ndarray, str]:
    """Forecast *horizon* steps of one residual series via ETS, with fallbacks.

    Fits a small ladder of ExponentialSmoothing configurations (seasonal
    variants only when the history covers at least one full season plus a
    margin) and keeps the all-finite forecast with the lowest AIC (or SSE)
    score. Degenerate input falls back to *fallback_value* (empty history)
    or the last observation (<3 points, or when every candidate fit fails).
    Returns (forecast, mode_tag) where the tag names the chosen path.
    """
    y = np.asarray(train_values, dtype=float)
    y = y[np.isfinite(y)]
    if horizon <= 0:
        return np.empty(0, dtype=float), "empty-horizon"
    if y.size == 0:
        return np.full(horizon, fallback_value, dtype=float), "global-fallback"
    if y.size < 3:
        return np.full(horizon, float(y[-1]), dtype=float), "last-value-fallback"
    candidate_configs: list[dict[str, object]] = [
        {"trend": None, "seasonal": None, "damped_trend": False, "tag": "level"},
        {"trend": "add", "seasonal": None, "damped_trend": False, "tag": "level-trend"},
    ]
    # Seasonal candidates need at least one full cycle (plus margin) of data.
    if y.size >= seasonal_periods + 4:
        candidate_configs.append(
            {"trend": None, "seasonal": "add", "damped_trend": False, "tag": "level-seasonal"}
        )
        candidate_configs.append(
            {"trend": "add", "seasonal": "add", "damped_trend": True, "tag": "trend-seasonal-damped"}
        )
    best_forecast: np.ndarray | None = None
    best_score = float("inf")
    best_tag = "last-value-fallback"
    for cfg in candidate_configs:
        try:
            model = ExponentialSmoothing(
                y,
                trend=cfg["trend"],
                seasonal=cfg["seasonal"],
                damped_trend=bool(cfg["damped_trend"]),
                seasonal_periods=seasonal_periods if cfg["seasonal"] else None,
                initialization_method="estimated",
            )
            fit = model.fit(optimized=True, use_brute=False)
            pred = np.asarray(fit.forecast(horizon), dtype=float)
            # Lower score wins; non-finite forecasts are discarded outright.
            score = _safe_score(getattr(fit, "aic", None), getattr(fit, "sse", None))
            if np.all(np.isfinite(pred)) and score < best_score:
                best_score = score
                best_forecast = pred
                best_tag = str(cfg["tag"])
        # A failed fit (convergence/value errors) simply removes that candidate.
        except Exception:
            continue
    if best_forecast is None:
        return np.full(horizon, float(y[-1]), dtype=float), "last-value-fallback"
    return best_forecast, best_tag
def run_cv(
    train_df: pd.DataFrame,
    n_folds: int,
    val_weeks: int,
    seasonal_periods: int,
    exog_alpha: float,
) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, int]]:
    """Forward-chaining CV for the two-stage (Ridge exog + per-series ETS) model.

    Stage 1 fits a pooled Ridge regression of the anomaly on exogenous
    features; stage 2 forecasts each series' Ridge residual with ETS.
    Returns fold metrics (plus a 'mean' row), out-of-fold predictions, and a
    tally of which ETS fallback mode was used per series.
    """
    folds = build_forward_folds(train_df, n_folds=n_folds, val_weeks=val_weeks)
    metrics_rows: list[dict[str, float | int | str]] = []
    oof_parts: list[pd.DataFrame] = []
    mode_counts: dict[str, int] = {}
    for fold_id, train_dates, val_dates in folds:
        print(f"\nFold {fold_id}: train={len(train_dates)} weeks, val={len(val_dates)} weeks")
        t0 = time.perf_counter()
        fold_train_raw = train_df[train_df["Date"].isin(train_dates)].copy()
        fold_val_raw = train_df[train_df["Date"].isin(val_dates)].copy()
        climatology = compute_climatology(fold_train_raw)
        fold_train = attach_climatology(fold_train_raw, climatology)
        fold_val = attach_climatology(fold_val_raw, climatology)
        # Stage 1: pooled Ridge on exogenous features, fit on this fold's train rows only.
        exog_model = make_exog_model(exog_alpha)
        exog_model.fit(fold_train[EXOG_FEATURE_COLS], fold_train["sales_anom"].to_numpy(dtype=float))
        fold_train["exog_anom_pred"] = np.asarray(exog_model.predict(fold_train[EXOG_FEATURE_COLS]), dtype=float)
        fold_val["exog_anom_pred"] = np.asarray(exog_model.predict(fold_val[EXOG_FEATURE_COLS]), dtype=float)
        fold_train["resid_anom"] = fold_train["sales_anom"] - fold_train["exog_anom_pred"]
        # Global fallback level for series with no training history.
        fallback = float(fold_train["resid_anom"].median())
        fold_preds: list[pd.DataFrame] = []
        val_series_ids = fold_val["series_id"].drop_duplicates().tolist()
        total_series = len(val_series_ids)
        # Stage 2: one ETS (with fallbacks) per validation series.
        for i, series_id in enumerate(val_series_ids, start=1):
            train_series = fold_train[fold_train["series_id"] == series_id].sort_values("Date")
            val_series = fold_val[fold_val["series_id"] == series_id].sort_values("Date").copy()
            horizon = val_series.shape[0]
            if train_series.empty:
                pred = np.full(horizon, fallback, dtype=float)
                mode = "global-fallback"
            else:
                pred, mode = forecast_ets_with_fallback(
                    train_series["resid_anom"].to_numpy(dtype=float),
                    horizon=horizon,
                    seasonal_periods=seasonal_periods,
                    fallback_value=fallback,
                )
            mode_counts[mode] = mode_counts.get(mode, 0) + 1
            val_series["pred_resid_anom"] = pred
            # Final anomaly = exogenous component + ETS residual forecast.
            val_series["pred_anom"] = val_series["exog_anom_pred"].to_numpy(dtype=float) + val_series[
                "pred_resid_anom"
            ].to_numpy(dtype=float)
            val_series["pred_sales"] = val_series["pred_anom"] + val_series["clim_sales"]
            fold_preds.append(val_series)
            if i % 400 == 0:
                print(f" processed {i}/{total_series} series...")
        fold_val_pred = pd.concat(fold_preds, ignore_index=True) if fold_preds else fold_val.assign(pred_sales=np.nan)
        fold_metrics = summarize_metrics(fold_val_pred)
        fold_metrics["runtime_sec"] = float(time.perf_counter() - t0)
        fold_metrics["fold"] = fold_id
        fold_metrics["train_start"] = str(pd.Timestamp(train_dates[0]).date())
        fold_metrics["train_end"] = str(pd.Timestamp(train_dates[-1]).date())
        fold_metrics["val_start"] = str(pd.Timestamp(val_dates[0]).date())
        fold_metrics["val_end"] = str(pd.Timestamp(val_dates[-1]).date())
        metrics_rows.append(fold_metrics)
        print(
            f" wmae={fold_metrics['wmae']:.4f} mae={fold_metrics['mae']:.4f} "
            f"rmse={fold_metrics['rmse']:.4f} runtime={fold_metrics['runtime_sec']:.1f}s"
        )
        keep_cols = [
            "Store",
            "Dept",
            "Date",
            "IsHoliday",
            "Weekly_Sales",
            "sales_anom",
            "pred_anom",
            "pred_sales",
            "series_id",
        ]
        part = fold_val_pred[keep_cols].copy()
        part["fold"] = fold_id
        oof_parts.append(part)
    metrics_df = pd.DataFrame(metrics_rows)
    if not metrics_df.empty:
        # Append an aggregate row so reports can read the mean directly.
        mean_row = {"fold": "mean"}
        for col in ["wmae", "mae", "rmse", "anom_mae", "n_rows", "runtime_sec"]:
            mean_row[col] = float(metrics_df[col].mean())
        metrics_df = pd.concat([metrics_df, pd.DataFrame([mean_row])], ignore_index=True)
    oof_df = pd.concat(oof_parts, ignore_index=True) if oof_parts else pd.DataFrame()
    return metrics_df, oof_df, mode_counts
def fit_full_and_predict_test(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    seasonal_periods: int,
    exog_alpha: float,
) -> tuple[pd.DataFrame, dict[str, int]]:
    """Fit the exogenous-Ridge + per-series ETS hybrid on all of train and forecast test.

    Steps: attach climatology to both frames, fit the exogenous model on the
    anomaly target, compute residual anomalies on train, then forecast each
    test series' residuals with ETS (falling back to the global residual
    median for series with no history). Returns the prediction frame and a
    count of which forecasting mode was used per series.
    """
    clim = compute_climatology(train_df)
    tr = attach_climatology(train_df.copy(), clim)
    te = attach_climatology(test_df.copy(), clim)

    # Exogenous component: linear model on the anomaly target, applied to both splits.
    exog_model = make_exog_model(exog_alpha)
    exog_model.fit(tr[EXOG_FEATURE_COLS], tr["sales_anom"].to_numpy(dtype=float))
    tr["exog_anom_pred"] = np.asarray(exog_model.predict(tr[EXOG_FEATURE_COLS]), dtype=float)
    te["exog_anom_pred"] = np.asarray(exog_model.predict(te[EXOG_FEATURE_COLS]), dtype=float)
    tr["resid_anom"] = tr["sales_anom"] - tr["exog_anom_pred"]

    # Robust global fallback for series that have no training history.
    global_fallback = float(tr["resid_anom"].median())

    parts: list[pd.DataFrame] = []
    mode_counts: dict[str, int] = {}
    series_ids = te["series_id"].drop_duplicates().tolist()
    total_series = len(series_ids)
    for i, sid in enumerate(series_ids, start=1):
        hist = tr[tr["series_id"] == sid].sort_values("Date")
        future = te[te["series_id"] == sid].sort_values("Date").copy()
        horizon = future.shape[0]
        if hist.empty:
            forecast = np.full(horizon, global_fallback, dtype=float)
            mode = "global-fallback"
        else:
            forecast, mode = forecast_ets_with_fallback(
                hist["resid_anom"].to_numpy(dtype=float),
                horizon=horizon,
                seasonal_periods=seasonal_periods,
                fallback_value=global_fallback,
            )
        mode_counts[mode] = 1 + mode_counts.get(mode, 0)
        future["pred_resid_anom"] = forecast
        exog_part = future["exog_anom_pred"].to_numpy(dtype=float)
        resid_part = future["pred_resid_anom"].to_numpy(dtype=float)
        future["pred_anom"] = exog_part + resid_part
        # Back to sales scale: anomaly prediction plus climatology baseline.
        future["pred_sales"] = future["pred_anom"] + future["clim_sales"]
        parts.append(
            future[["Store", "Dept", "Date", "IsHoliday", "pred_sales", "pred_anom", "series_id"]]
        )
        if i % 400 == 0:
            print(f" processed {i}/{total_series} series...")
    pred_df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
    return pred_df, mode_counts
def main() -> None:
    """Entry point for the Phase 5 ETS anomaly baseline.

    Loads featurized train/test data, runs forward-chaining CV, fits the
    exogenous + ETS hybrid on the full training set, forecasts the test set,
    and writes metrics, OOF predictions, test predictions, and the run
    configuration under --output-dir.
    """
    args = parse_args()
    # Silence statsmodels ETS convergence warnings; fallback logic handles poor fits.
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    output_dir = ensure_output_dir(Path(args.output_dir))
    print("Loading data...")
    train_df, test_df = load_data(Path(args.train_path), Path(args.test_path), max_series=args.max_series)
    print(
        f"train rows={train_df.shape[0]}, test rows={test_df.shape[0]}, "
        f"train series={train_df['series_id'].nunique()}, test series={test_df['series_id'].nunique()}"
    )
    print("\nRunning forward-chaining CV...")
    metrics_df, oof_df, cv_mode_counts = run_cv(
        train_df=train_df,
        n_folds=args.n_folds,
        val_weeks=args.val_weeks,
        seasonal_periods=args.seasonal_periods,
        exog_alpha=args.exog_alpha,
    )
    metrics_path = output_dir / "metrics.csv"
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Saved metrics -> {metrics_path}")
    oof_path = output_dir / "oof.parquet"
    oof_df.to_parquet(oof_path, index=False)
    print(f"Saved OOF predictions -> {oof_path}")
    print("\nFitting full train and forecasting test...")
    test_pred, test_mode_counts = fit_full_and_predict_test(
        train_df=train_df,
        test_df=test_df,
        seasonal_periods=args.seasonal_periods,
        exog_alpha=args.exog_alpha,
    )
    test_path = output_dir / "test.parquet"
    test_pred.to_parquet(test_path, index=False)
    print(f"Saved test predictions -> {test_path}")
    # Persist the full run configuration next to the artifacts for reproducibility.
    config = {
        "model": "ets",
        "target": "sales_anom (via exogenous + ETS residual)",
        "seasonal_periods": args.seasonal_periods,
        "exogenous_features": EXOG_FEATURE_COLS,
        "exog_alpha": args.exog_alpha,
        "n_folds": args.n_folds,
        "val_weeks": args.val_weeks,
        "max_series": args.max_series,
        "cv_mode_counts": cv_mode_counts,
        "test_mode_counts": test_mode_counts,
        "train_path": args.train_path,
        "test_path": args.test_path,
    }
    config_path = output_dir / "config.json"
    config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
    print(f"Saved config -> {config_path}")
phase5_main = main

2.13 Phase 5 Run

run_with_argv(phase5_main, ["phase5_ets"])

2.14 Phase 6 Definitions
from __future__ import annotations
import argparse
import json
from pathlib import Path
from sklearn.ensemble import AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the AdaBoost anomaly baseline run."""
    cli = argparse.ArgumentParser(
        description="Phase 7: AdaBoost anomaly baseline with recursive forward-chaining evaluation."
    )
    # Data locations and output directory.
    cli.add_argument("--train-path", default="train_feat.parquet")
    cli.add_argument("--test-path", default="test_feat.parquet")
    cli.add_argument("--output-dir", default="outputs/baselines/adaboost")
    # Forward-chaining CV layout.
    cli.add_argument("--n-folds", type=int, default=4)
    cli.add_argument("--val-weeks", type=int, default=13)
    cli.add_argument("--max-series", type=int, default=None)
    # AdaBoost / base-tree hyperparameters.
    cli.add_argument("--n-estimators", type=int, default=300)
    cli.add_argument("--learning-rate", type=float, default=0.03)
    cli.add_argument("--max-depth", type=int, default=3)
    cli.add_argument("--loss", default="linear", choices=["linear", "square", "exponential"])
    return cli.parse_args()
def make_model(
    n_estimators: int,
    learning_rate: float,
    max_depth: int,
    loss: str,
):
    """Build an AdaBoost regression pipeline with median imputation.

    The base learner is a shallow decision tree; the imputer handles missing
    lag/exogenous values before boosting. Seeds are fixed for reproducibility.
    """
    base_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    shared_kwargs = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        loss=loss,
        random_state=42,
    )
    # sklearn renamed `base_estimator` to `estimator`; try the new name first
    # and fall back for older versions.
    try:
        booster = AdaBoostRegressor(estimator=base_tree, **shared_kwargs)
    except TypeError:
        booster = AdaBoostRegressor(base_estimator=base_tree, **shared_kwargs)
    pipeline_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("model", booster),
    ]
    return Pipeline(steps=pipeline_steps)
def main() -> None:
    """Entry point for the AdaBoost anomaly baseline (labeled "Phase 7" in the CLI).

    Loads featurized train/test data, evaluates an AdaBoost pipeline with
    recursive forward-chaining CV, refits on the full training set to forecast
    the test horizon, and writes metrics, OOF predictions, test predictions,
    and the run configuration under --output-dir.
    """
    args = parse_args()
    output_dir = ensure_output_dir(Path(args.output_dir))
    print("Loading data...")
    train_df, test_df = load_data(Path(args.train_path), Path(args.test_path), max_series=args.max_series)
    print(
        f"train rows={train_df.shape[0]}, test rows={test_df.shape[0]}, "
        f"train series={train_df['series_id'].nunique()}, test series={test_df['series_id'].nunique()}"
    )
    print("\nRunning recursive forward-chaining CV...")
    # NOTE(review): run_cv_tabular_baseline_anomaly, ANOM_FEATURE_COLS, and the
    # test-forecast helper below come from the shared baseline harness defined
    # elsewhere in this file.
    metrics_df, oof_df = run_cv_tabular_baseline_anomaly(
        train_df,
        # Fresh model per fold so no state leaks across folds.
        model_builder=lambda: make_model(
            args.n_estimators,
            args.learning_rate,
            args.max_depth,
            args.loss,
        ),
        n_folds=args.n_folds,
        val_weeks=args.val_weeks,
        lag_orders=LAG_ORDERS,
        feature_cols=ANOM_FEATURE_COLS,
    )
    metrics_path = output_dir / "metrics.csv"
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Saved metrics -> {metrics_path}")
    oof_path = output_dir / "oof.parquet"
    oof_df.to_parquet(oof_path, index=False)
    print(f"Saved OOF predictions -> {oof_path}")
    print("\nFitting full train and forecasting test recursively...")
    test_pred = fit_full_train_and_predict_test_tabular_anomaly(
        train_df,
        test_df,
        model_builder=lambda: make_model(
            args.n_estimators,
            args.learning_rate,
            args.max_depth,
            args.loss,
        ),
        lag_orders=LAG_ORDERS,
        feature_cols=ANOM_FEATURE_COLS,
    )
    test_path = output_dir / "test.parquet"
    test_pred.to_parquet(test_path, index=False)
    print(f"Saved test predictions -> {test_path}")
    # Persist the full run configuration next to the artifacts for reproducibility.
    config = {
        "model": "adaboost",
        "target": "sales_anom",
        "features": ANOM_FEATURE_COLS,
        "lags": list(LAG_ORDERS),
        "n_estimators": args.n_estimators,
        "learning_rate": args.learning_rate,
        "max_depth": args.max_depth,
        "loss": args.loss,
        "n_folds": args.n_folds,
        "val_weeks": args.val_weeks,
        "max_series": args.max_series,
        "train_path": args.train_path,
        "test_path": args.test_path,
    }
    config_path = output_dir / "config.json"
    config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
    print(f"Saved config -> {config_path}")
phase6_main = main

2.15 Phase 6 Run

run_with_argv(phase6_main, ["phase6_adaboost"])

3 Data and Evaluation Setup
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Brand-style colors used consistently across all comparison plots below.
PANTONE_GREEN = "#00A651"  # Pantone-style green (approx RGB equivalent)
PANTONE_ORANGE = "#FF6A13"  # Pantone-style orange (approx RGB equivalent)
def find_project_root() -> Path:
    """Locate the project root by probing for a known metrics artifact.

    Checks the working directory and up to two parents, returning the first
    directory that contains outputs/metrics/local_linear_cv.csv, resolved to
    an absolute path.

    Raises:
        FileNotFoundError: if no candidate directory holds the marker file.
    """
    marker = Path("outputs") / "metrics" / "local_linear_cv.csv"
    for base in (Path("."), Path(".."), Path("../..")):
        if (base / marker).exists():
            return base.resolve()
    raise FileNotFoundError("Could not locate project root with outputs/metrics/local_linear_cv.csv")
def split_fold_and_mean(cv_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split a metrics table into per-fold rows and the summary "mean" row.

    Returns a copy of the rows whose fold label is not "mean", plus the first
    "mean" row as a Series.
    """
    labels = cv_df["fold"].astype(str)
    per_fold = cv_df.loc[labels != "mean"].copy()
    summary = cv_df.loc[labels == "mean"].iloc[0]
    return per_fold, summary
def weekly_actual_vs_pred(oof_df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate row-level predictions into weekly totals of actual vs predicted sales.

    When the OOF frame carries interval columns (pred_sales_lower/upper), the
    aggregated bounds are included as pred_lower/pred_upper.
    """
    spec: dict[str, tuple[str, str]] = {
        "actual_sales": ("Weekly_Sales", "sum"),
        "pred_sales": ("pred_sales", "sum"),
    }
    has_bounds = {"pred_sales_lower", "pred_sales_upper"}.issubset(oof_df.columns)
    if has_bounds:
        spec["pred_lower"] = ("pred_sales_lower", "sum")
        spec["pred_upper"] = ("pred_sales_upper", "sum")
    weekly = oof_df.groupby("Date", as_index=False).agg(**spec)
    return weekly.sort_values("Date")
def plot_weekly_actual_vs_pred(
    oof_df: pd.DataFrame,
    title: str,
    pred_color: str = PANTONE_ORANGE,
    show_uncertainty: bool = False,
) -> None:
    """Plot aggregated weekly actual vs predicted sales, optionally with an interval band."""
    weekly = weekly_actual_vs_pred(oof_df)
    fig, ax = plt.subplots(figsize=(10, 4))
    dates = weekly["Date"]
    ax.plot(dates, weekly["actual_sales"], color=PANTONE_GREEN, linewidth=2, label="Actual sales")
    ax.plot(dates, weekly["pred_sales"], color=pred_color, linewidth=2, label="Predicted sales")
    # Shade the predictive interval only when bounds were aggregated and requested.
    band_available = {"pred_lower", "pred_upper"}.issubset(weekly.columns)
    if show_uncertainty and band_available:
        ax.fill_between(
            dates,
            weekly["pred_lower"],
            weekly["pred_upper"],
            color=pred_color,
            alpha=0.2,
            label="Predictive interval",
        )
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel("Aggregated Weekly Sales")
    ax.grid(alpha=0.25)
    ax.legend()
    plt.show()
def make_param_df(entries: list[tuple[str, object, str]]) -> pd.DataFrame:
    """Tabulate (parameter, value, explanation) triples for notebook display."""
    column_names = ["Parameter", "Value", "How it works"]
    return pd.DataFrame(entries, columns=column_names)
# Resolve the project root once; every artifact path below is relative to it.
ROOT = find_project_root()
OUTPUTS = ROOT / "outputs"
BASELINES = OUTPUTS / "baselines"
# Phase 1 (full data): CV metrics, OOF predictions, and run config.
p1_cv = pd.read_csv(OUTPUTS / "metrics" / "local_linear_cv.csv")
p1_oof = pd.read_parquet(OUTPUTS / "preds" / "local_linear_oof.parquet")
p1_cfg = json.loads((OUTPUTS / "metrics" / "local_linear_config.json").read_text(encoding="utf-8"))
# Phase 1-lite diagnostic variant (reduced exogenous feature set).
p1_lite_cv = pd.read_csv(OUTPUTS / "phase1_local_linear_lite" / "metrics" / "local_linear_cv.csv")
p1_lite_oof = pd.read_parquet(OUTPUTS / "phase1_local_linear_lite" / "preds" / "local_linear_oof.parquet")
p1_lite_cfg = json.loads(
    (OUTPUTS / "phase1_local_linear_lite" / "metrics" / "local_linear_config.json").read_text(encoding="utf-8")
)
# Hyperparameter-search artifacts for Local Linear.
grid_results = pd.read_csv(OUTPUTS / "search" / "local_linear_grid_results.csv")
best_grid_cfg = json.loads((OUTPUTS / "search" / "best_local_linear_config.json").read_text(encoding="utf-8"))
# Phase 2 (full data): Bayesian structural AR artifacts.
p2_cv = pd.read_csv(OUTPUTS / "phase2_structural_ar_full" / "metrics" / "structural_ar_cv.csv")
p2_oof = pd.read_parquet(OUTPUTS / "phase2_structural_ar_full" / "preds" / "structural_ar_oof.parquet")
p2_cfg = json.loads((OUTPUTS / "phase2_structural_ar_full" / "metrics" / "structural_ar_config.json").read_text(encoding="utf-8"))
# Phase 3, 4, 5, 7 baselines: each stores metrics.csv, oof.parquet, config.json.
en_cv = pd.read_csv(BASELINES / "elastic_net" / "metrics.csv")
en_oof = pd.read_parquet(BASELINES / "elastic_net" / "oof.parquet")
en_cfg = json.loads((BASELINES / "elastic_net" / "config.json").read_text(encoding="utf-8"))
rf_cv = pd.read_csv(BASELINES / "random_forest" / "metrics.csv")
rf_oof = pd.read_parquet(BASELINES / "random_forest" / "oof.parquet")
rf_cfg = json.loads((BASELINES / "random_forest" / "config.json").read_text(encoding="utf-8"))
ets_cv = pd.read_csv(BASELINES / "ets" / "metrics.csv")
ets_oof = pd.read_parquet(BASELINES / "ets" / "oof.parquet")
ets_cfg = json.loads((BASELINES / "ets" / "config.json").read_text(encoding="utf-8"))
ad_cv = pd.read_csv(BASELINES / "adaboost" / "metrics.csv")
ad_oof = pd.read_parquet(BASELINES / "adaboost" / "oof.parquet")
ad_cfg = json.loads((BASELINES / "adaboost" / "config.json").read_text(encoding="utf-8"))
# Normalize Date columns to datetime for plotting and grouping.
for df in [p1_oof, p1_lite_oof, p2_oof, en_oof, rf_oof, ets_oof, ad_oof]:
    df["Date"] = pd.to_datetime(df["Date"])
# Separate per-fold rows from the summary "mean" row for each model.
p1_fold, p1_mean = split_fold_and_mean(p1_cv)
p1_lite_fold, p1_lite_mean = split_fold_and_mean(p1_lite_cv)
p2_fold, p2_mean = split_fold_and_mean(p2_cv)
en_fold, en_mean = split_fold_and_mean(en_cv)
rf_fold, rf_mean = split_fold_and_mean(rf_cv)
ets_fold, ets_mean = split_fold_and_mean(ets_cv)
ad_fold, ad_mean = split_fold_and_mean(ad_cv)
# Sanity-check table: row counts per loaded OOF artifact.
artifact_summary = pd.DataFrame(
    [
        {"Artifact": "Phase 1 OOF rows", "Value": int(p1_oof.shape[0])},
        {"Artifact": "Phase 1-lite OOF rows", "Value": int(p1_lite_oof.shape[0])},
        {"Artifact": "Phase 2 OOF rows", "Value": int(p2_oof.shape[0])},
        {"Artifact": "Elastic Net OOF rows", "Value": int(en_oof.shape[0])},
        {"Artifact": "Random Forest OOF rows", "Value": int(rf_oof.shape[0])},
        {"Artifact": "ETS OOF rows", "Value": int(ets_oof.shape[0])},
        {"Artifact": "AdaBoost OOF rows", "Value": int(ad_oof.shape[0])},
    ]
)
artifact_summary

| Artifact | Value | |
|---|---|---|
| 0 | Phase 1 OOF rows | 154386 |
| 1 | Phase 1-lite OOF rows | 154386 |
| 2 | Phase 2 OOF rows | 154386 |
| 3 | Elastic Net OOF rows | 154386 |
| 4 | Random Forest OOF rows | 154386 |
| 5 | ETS OOF rows | 154386 |
| 6 | AdaBoost OOF rows | 154386 |
Validation metric:
\[ \text{WMAE} = \frac{\sum_i w_i \lvert y_i - \hat{y}_i \rvert}{\sum_i w_i}, \quad w_i = \begin{cases} 5, & \text{if holiday} \\ 1, & \text{otherwise} \end{cases} \]
4 Model 1: Local Linear Anomaly (Phase 1)
Local Linear is a weighted regression that learns a different local coefficient vector for each week-of-year neighborhood and each series. It is simple and interpretable, but sensitive to feature scaling, so standardization is applied before fitting.
4.1 Mathematical Formulation
For series (s) and target seasonal week (w):
\[ \hat{\beta}_{s,w} = \arg\min_{\beta} \sum_{i \in s} K_h\!\left(d(\text{week}_i,w)\right) \left(y_i - \beta_0 - x_i^\top \beta\right)^2 + \lambda \|\beta\|_2^2 \]
and recursive prediction:
\[ \hat{y}^{anom}_{t,s}=\beta_0 + x_{t,s}^\top \beta,\qquad \hat{y}_{t,s}=\hat{y}^{anom}_{t,s}+clim_{t,s} \]
4.2 Plain-Language Meaning of Terms
- series s: one specific Store + Dept time series.
- week_i and w: historical week index and target seasonal week where local fitting is centered.
- K_h(d(.)): a weight that gives more importance to rows close in the seasonal calendar (e.g., nearby weeks of year).
- x_i: the input features for a row (lags, holiday flag, exogenous variables).
- beta_0, beta: intercept and coefficients learned by local weighted regression.
- lambda: regularization strength that shrinks coefficients to avoid unstable fits.
- y^{anom}: anomaly target (real sales minus climatology baseline).
- clim_{t,s}: baseline seasonal level added back to convert the anomaly prediction to a sales prediction.
4.3 Workflow
flowchart LR
A[Read train/test parquet] --> B[Compute climatology per series/store/week]
B --> C[Create anomalies and lag1/lag2]
C --> D[Impute missing lag/exogenous values]
D --> E[Standard-scale Phase 1 features]
E --> F[Forward-chaining CV]
F --> G[Fit local weighted ridge by series and seasonal week]
G --> H[Recursive rollout on validation horizon]
H --> I[OOF metrics and predictions]
I --> J[Fit full train and forecast test]flowchart LR A[Read train/test parquet] --> B[Compute climatology per series/store/week] B --> C[Create anomalies and lag1/lag2] C --> D[Impute missing lag/exogenous values] D --> E[Standard-scale Phase 1 features] E --> F[Forward-chaining CV] F --> G[Fit local weighted ridge by series and seasonal week] G --> H[Recursive rollout on validation horizon] H --> I[OOF metrics and predictions] I --> J[Fit full train and forecast test]
4.4 Tuned Parameters and Effect
make_param_df(
[
("kernel", p1_cfg["kernel"], "Kernel shape over seasonal distance; controls local weighting profile."),
("bandwidth", p1_cfg["bandwidth"], "Neighborhood width in week-of-year units; larger means smoother seasonal fit."),
("min_samples", p1_cfg["min_samples"], "Minimum active samples to fit a local model; guards against unstable fits."),
("ridge", p1_cfg["ridge"], "L2 regularization strength in weighted regression."),
("coef_clip", p1_cfg["coef_clip"], "Post-fit coefficient clipping to prevent extreme local slopes."),
("anom_clip_scale", p1_cfg["anom_clip_scale"], "Prediction clipping band around anomaly quantiles for recursive stability."),
("lags", p1_cfg["lags"], "Autoregressive inputs used in recursive forecasting."),
("standard_scaler", p1_cfg.get("standard_scaler", True), "Feature standardization before fitting local regressions."),
]
)| Parameter | Value | How it works | |
|---|---|---|---|
| 0 | kernel | tricube | Kernel shape over seasonal distance; controls ... |
| 1 | bandwidth | 6 | Neighborhood width in week-of-year units; larg... |
| 2 | min_samples | 16 | Minimum active samples to fit a local model; g... |
| 3 | ridge | 0.0001 | L2 regularization strength in weighted regress... |
| 4 | coef_clip | 6.0 | Post-fit coefficient clipping to prevent extre... |
| 5 | anom_clip_scale | 2.0 | Prediction clipping band around anomaly quanti... |
| 6 | lags | [1, 2] | Autoregressive inputs used in recursive foreca... |
| 7 | standard_scaler | True | Feature standardization before fitting local r... |
p1_cfg{'kernel': 'tricube',
'bandwidth': 6,
'min_samples': 16,
'ridge': 0.0001,
'coef_clip': 6.0,
'anom_clip_scale': 2.0,
'lags': [1, 2],
'use_interactions': False,
'interaction_cols': [],
'feature_cols': ['temp_anom',
'fuel_anom',
'MarkDown1',
'MarkDown2',
'MarkDown3',
'MarkDown4',
'MarkDown5',
'CPI',
'Unemployment',
'sales_anom_lag1',
'sales_anom_lag2',
'is_holiday_int'],
'standard_scaler': True,
'n_folds': 4,
'val_weeks': 13,
'max_series': None}
p1_fold[["fold", "train_start", "train_end", "val_start", "val_end", "wmae", "mae", "rmse"]]| fold | train_start | train_end | val_start | val_end | wmae | mae | rmse | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 2011-10-28 | 2011-11-04 | 2012-01-27 | 5989.774614 | 5916.085624 | 11465.394577 |
| 1 | 2 | 2010-02-05 | 2012-01-27 | 2012-02-03 | 2012-04-27 | 10061.944470 | 10472.387681 | 20457.695819 |
| 2 | 3 | 2010-02-05 | 2012-04-27 | 2012-05-04 | 2012-07-27 | 11956.737489 | 11956.737489 | 21420.299578 |
| 3 | 4 | 2010-02-05 | 2012-07-27 | 2012-08-03 | 2012-10-26 | 12294.668865 | 12479.604773 | 22343.880598 |
p1_mean[["wmae", "mae", "rmse"]]wmae 10075.78136
mae 10206.203892
rmse 18921.817643
Name: 4, dtype: object
5 Model 1-lite: Local Linear Diagnostic Variant
Phase 1-lite keeps the same Local Linear method but removes the extra exogenous variables (MarkDown1-5, CPI, Unemployment) and uses only temp_anom, fuel_anom plus lags/holiday.
p1_lite_cfg{'kernel': 'tricube',
'bandwidth': 6,
'min_samples': 16,
'ridge': 0.0001,
'coef_clip': 6.0,
'anom_clip_scale': 2.0,
'lags': [1, 2],
'feature_mode': 'lite',
'exogenous_features': ['temp_anom', 'fuel_anom'],
'use_interactions': False,
'interaction_cols': [],
'feature_cols': ['temp_anom',
'fuel_anom',
'sales_anom_lag1',
'sales_anom_lag2',
'is_holiday_int'],
'standard_scaler': True,
'n_folds': 4,
'val_weeks': 13,
'max_series': None}
p1_lite_fold[["fold", "train_start", "train_end", "val_start", "val_end", "wmae", "mae", "rmse"]]| fold | train_start | train_end | val_start | val_end | wmae | mae | rmse | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 2011-10-28 | 2011-11-04 | 2012-01-27 | 2468.567458 | 2271.802436 | 4955.291247 |
| 1 | 2 | 2010-02-05 | 2012-01-27 | 2012-02-03 | 2012-04-27 | 2199.682469 | 2196.184633 | 4485.720535 |
| 2 | 3 | 2010-02-05 | 2012-04-27 | 2012-05-04 | 2012-07-27 | 2028.753667 | 2028.753667 | 3948.262807 |
| 3 | 4 | 2010-02-05 | 2012-07-27 | 2012-08-03 | 2012-10-26 | 1947.742271 | 1923.235682 | 3954.874629 |
p1_lite_mean[["wmae", "mae", "rmse"]]wmae 2161.186467
mae 2104.994105
rmse 4336.037305
Name: 4, dtype: object
6 Model 2: Bayesian Structural AR (Phase 2)
Structural AR combines hierarchical intercepts, autoregressive lags, exogenous drivers, and seasonal Fourier terms in one probabilistic model. It is designed to capture both shared structure and series-specific behavior.
6.1 Mathematical Formulation
In z-score space:
\[ y^{*}_{t,s} \sim \mathcal{N}(\mu_{t,s}, \sigma) \]
\[ \mu_{t,s} = \alpha_s + \beta_{lag}^\top lag_{t,s} + \beta_{exog}^\top exog_{t,s} + \beta_h h_{t,s} + \beta_{tr} trend_t + \beta_f^\top fourier_t \]
with hierarchical intercept:
\[ \alpha_s = \alpha_\mu + \alpha_\sigma z_s,\qquad z_s \sim \mathcal{N}(0,1) \]
6.2 Plain-Language Meaning of Terms
- y*_{t,s}: normalized anomaly target for week t and series s.
- mu_{t,s}: model mean prediction before adding random noise.
- alpha_s: series-specific baseline level; each Store + Dept has its own intercept.
- beta_lag, lag_{t,s}: coefficients and lag features that capture autoregressive behavior.
- beta_exog, exog_{t,s}: effects of external drivers (temperature, markdown flags, CPI, unemployment, etc.).
- beta_h h_{t,s}: holiday contribution.
- beta_tr trend_t: long-run drift over time.
- beta_f fourier_t: smooth yearly seasonal pattern via sine/cosine terms.
- sigma: predictive noise level around the mean.
6.3 Workflow
flowchart LR
A[Read train parquet] --> B[Compute climatology]
B --> C[Create anomaly target and lag1/lag2]
C --> D[Build trend and Fourier seasonal terms]
D --> E[Fill exogenous NA from train medians]
E --> F[Standardize lag/exogenous/trend variables]
F --> G[Fit PyMC structural model via MAP]
G --> H[Recursive probabilistic rollout by date]
H --> I[Aggregate draws to mean/sd and intervals]
I --> J[CV metrics and OOF outputs]flowchart LR A[Read train parquet] --> B[Compute climatology] B --> C[Create anomaly target and lag1/lag2] C --> D[Build trend and Fourier seasonal terms] D --> E[Fill exogenous NA from train medians] E --> F[Standardize lag/exogenous/trend variables] F --> G[Fit PyMC structural model via MAP] G --> H[Recursive probabilistic rollout by date] H --> I[Aggregate draws to mean/sd and intervals] I --> J[CV metrics and OOF outputs]
6.4 Tuned Parameters and Effect
make_param_df(
[
("max_eval", p2_cfg["max_eval"], "Maximum evaluations in MAP optimization; higher allows deeper convergence."),
("pred_draws", p2_cfg["pred_draws"], "Number of stochastic recursive draws for predictive mean/uncertainty."),
("fourier_order", p2_cfg["fourier_order"], "Number of sine/cosine seasonal harmonics."),
("lag_orders", p2_cfg["lag_orders"], "Autoregressive lags used as structural predictors."),
("sigma_clusters", p2_cfg["sigma_clusters"], "Number of heteroskedastic noise clusters; 0 means shared sigma."),
("exogenous_features", p2_cfg.get("exogenous_features", []), "External covariates entering the structural mean."),
]
)| Parameter | Value | How it works | |
|---|---|---|---|
| 0 | max_eval | 1800 | Maximum evaluations in MAP optimization; highe... |
| 1 | pred_draws | 40 | Number of stochastic recursive draws for predi... |
| 2 | fourier_order | 3 | Number of sine/cosine seasonal harmonics. |
| 3 | lag_orders | [1, 2] | Autoregressive lags used as structural predict... |
| 4 | sigma_clusters | 0 | Number of heteroskedastic noise clusters; 0 me... |
| 5 | exogenous_features | [temp_anom, fuel_anom, MarkDown1, MarkDown2, M... | External covariates entering the structural mean. |
p2_cfg{'n_folds': 4,
'val_weeks': 13,
'max_eval': 1800,
'random_seed': 8927,
'pred_draws': 40,
'fourier_order': 3,
'lag_orders': [1, 2],
'exogenous_features': ['temp_anom',
'fuel_anom',
'MarkDown1',
'MarkDown2',
'MarkDown3',
'MarkDown4',
'MarkDown5',
'CPI',
'Unemployment'],
'sigma_clusters': 0,
'max_series': 3331,
'note': 'Structural AR model with hierarchical series intercept, trend, Fourier seasonality, and recursive validation.'}
p2_fold[["fold", "train_start", "train_end", "val_start", "val_end", "wmae", "mae", "rmse", "runtime_sec"]]| fold | train_start | train_end | val_start | val_end | wmae | mae | rmse | runtime_sec | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 2011-10-28 | 2011-11-04 | 2012-01-27 | 2514.047613 | 2354.646140 | 4935.317829 | 64.718200 |
| 1 | 2 | 2010-02-05 | 2012-01-27 | 2012-02-03 | 2012-04-27 | 2192.877850 | 2207.494598 | 4400.426532 | 68.368072 |
| 2 | 3 | 2010-02-05 | 2012-04-27 | 2012-05-04 | 2012-07-27 | 2138.504577 | 2138.504577 | 3799.486079 | 68.047948 |
| 3 | 4 | 2010-02-05 | 2012-07-27 | 2012-08-03 | 2012-10-26 | 1904.502883 | 1885.828366 | 3620.883291 | 65.612177 |
p2_mean[["wmae", "mae", "rmse", "runtime_sec"]]wmae 2187.483231
mae 2146.61842
rmse 4189.028433
runtime_sec 66.686599
Name: 4, dtype: object
7 Model 3: Elastic Net Anomaly Baseline (Phase 3)
Elastic Net is a linear model with combined L1/L2 regularization. It is a robust baseline for correlated tabular features and keeps an interpretable global linear structure.
7.1 Mathematical Formulation
\[ \hat{\beta} = \arg\min_{\beta} \frac{1}{2n}\|y - X\beta\|_2^2 + \alpha\left( \frac{1-l1\_ratio}{2}\|\beta\|_2^2 + l1\_ratio\|\beta\|_1 \right) \]
7.2 Plain-Language Meaning of Terms
- X beta: linear combination of all features.
- ||y - X beta||^2: fit error term (how far predictions are from truth).
- alpha: total regularization strength.
- l1_ratio: balance between:
  - L1 penalty: pushes less-useful coefficients to exactly zero (feature selection behavior).
- L2 penalty: smoothly shrinks coefficients for stability under correlated features.
7.3 Workflow
flowchart LR
A[Read train/test parquet] --> B[Compute climatology and anomalies]
B --> C[Create lag1/lag2 and calendar features]
C --> D[Build exogenous feature matrix]
D --> E[Median imputation + standard scaling]
E --> F[Fit Elastic Net on train folds]
F --> G[Recursive forecasting on validation horizon]
G --> H[OOF metrics and predictions]
H --> I[Fit full train and forecast test]flowchart LR A[Read train/test parquet] --> B[Compute climatology and anomalies] B --> C[Create lag1/lag2 and calendar features] C --> D[Build exogenous feature matrix] D --> E[Median imputation + standard scaling] E --> F[Fit Elastic Net on train folds] F --> G[Recursive forecasting on validation horizon] G --> H[OOF metrics and predictions] H --> I[Fit full train and forecast test]
7.4 Tuned Parameters and Effect
make_param_df(
[
("alpha", en_cfg["alpha"], "Overall regularization strength; larger values shrink coefficients more."),
("l1_ratio", en_cfg["l1_ratio"], "Mix between L1 (sparsity) and L2 (stability)."),
("max_iter", en_cfg["max_iter"], "Maximum coordinate-descent iterations for convergence."),
("lags", en_cfg["lags"], "Lag features used in recursive setup."),
("features", en_cfg["features"], "Full tabular interface for model fitting."),
]
)| Parameter | Value | How it works | |
|---|---|---|---|
| 0 | alpha | 0.02 | Overall regularization strength; larger values... |
| 1 | l1_ratio | 0.2 | Mix between L1 (sparsity) and L2 (stability). |
| 2 | max_iter | 10000 | Maximum coordinate-descent iterations for conv... |
| 3 | lags | [1, 2] | Lag features used in recursive setup. |
| 4 | features | [lag1, lag2, week_of_year, month, is_holiday_i... | Full tabular interface for model fitting. |
en_cfg{'model': 'elastic_net',
'target': 'sales_anom',
'features': ['lag1',
'lag2',
'week_of_year',
'month',
'is_holiday_int',
'temp_anom',
'fuel_anom',
'MarkDown1',
'MarkDown2',
'MarkDown3',
'MarkDown4',
'MarkDown5',
'CPI',
'Unemployment'],
'lags': [1, 2],
'alpha': 0.02,
'l1_ratio': 0.2,
'max_iter': 10000,
'n_folds': 4,
'val_weeks': 13,
'max_series': None,
'train_path': 'train_feat.parquet',
'test_path': 'test_feat.parquet'}
en_fold[["fold", "train_start", "train_end", "val_start", "val_end", "wmae", "mae", "rmse", "runtime_sec"]]| fold | train_start | train_end | val_start | val_end | wmae | mae | rmse | runtime_sec | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 2011-10-28 | 2011-11-04 | 2012-01-27 | 2472.678568 | 2292.791482 | 4919.584363 | 6.225060 |
| 1 | 2 | 2010-02-05 | 2012-01-27 | 2012-02-03 | 2012-04-27 | 2163.772249 | 2175.479837 | 4420.761374 | 8.105888 |
| 2 | 3 | 2010-02-05 | 2012-04-27 | 2012-05-04 | 2012-07-27 | 2058.784472 | 2058.784472 | 3793.832714 | 7.717091 |
| 3 | 4 | 2010-02-05 | 2012-07-27 | 2012-08-03 | 2012-10-26 | 1833.856224 | 1803.861985 | 3645.865425 | 12.398858 |
en_mean[["wmae", "mae", "rmse", "runtime_sec"]]wmae 2132.272878
mae 2082.729444
rmse 4195.010969
runtime_sec 8.611724
Name: 4, dtype: object
8 Model 4: Random Forest Anomaly Baseline (Phase 4)
Random Forest is an ensemble of decision trees trained on bootstrap samples. It captures nonlinear interactions with little feature engineering and is robust to mixed feature scales.
8.1 Mathematical Formulation
For (B) trees:
\[ \hat{y}(x)=\frac{1}{B}\sum_{b=1}^{B} T_b(x) \]
where each (T_b) is grown on a bootstrap sample and split candidates are randomized via max_features.
8.2 Plain-Language Meaning of Terms
- T_b(x): prediction from tree b.
- B: number of trees in the forest.
- Final prediction is an average of trees, which reduces variance and improves robustness.
- Bootstrap sampling means each tree sees a slightly different sampled training set.
- Random split-feature selection (
max_features) decorrelates trees, improving ensemble quality.
8.3 Workflow
flowchart LR
A[Read train/test parquet] --> B[Compute climatology and anomalies]
B --> C[Create lag1/lag2 + calendar + exogenous features]
C --> D[Median imputation]
D --> E[Train Random Forest on fold train]
E --> F[Recursive multi-step rollout on fold validation]
F --> G[OOF metrics and predictions]
G --> H[Refit on full train and forecast test]flowchart LR A[Read train/test parquet] --> B[Compute climatology and anomalies] B --> C[Create lag1/lag2 + calendar + exogenous features] C --> D[Median imputation] D --> E[Train Random Forest on fold train] E --> F[Recursive multi-step rollout on fold validation] F --> G[OOF metrics and predictions] G --> H[Refit on full train and forecast test]
8.4 Tuned Parameters and Effect
make_param_df(
[
("n_estimators", rf_cfg["n_estimators"], "Number of trees; larger reduces variance but increases runtime."),
("max_depth", rf_cfg["max_depth"], "Maximum tree depth; controls model complexity."),
("min_samples_leaf", rf_cfg["min_samples_leaf"], "Minimum samples per leaf; regularizes tree partitions."),
("max_features", rf_cfg["max_features"], "Feature subset size considered at each split."),
("lags", rf_cfg["lags"], "Autoregressive lag features used recursively."),
]
)| Parameter | Value | How it works | |
|---|---|---|---|
| 0 | n_estimators | 120 | Number of trees; larger reduces variance but i... |
| 1 | max_depth | 18 | Maximum tree depth; controls model complexity. |
| 2 | min_samples_leaf | 2 | Minimum samples per leaf; regularizes tree par... |
| 3 | max_features | sqrt | Feature subset size considered at each split. |
| 4 | lags | [1, 2] | Autoregressive lag features used recursively. |
rf_cfg{'model': 'random_forest',
'target': 'sales_anom',
'features': ['lag1',
'lag2',
'week_of_year',
'month',
'is_holiday_int',
'temp_anom',
'fuel_anom',
'MarkDown1',
'MarkDown2',
'MarkDown3',
'MarkDown4',
'MarkDown5',
'CPI',
'Unemployment'],
'lags': [1, 2],
'n_estimators': 120,
'max_depth': 18,
'min_samples_leaf': 2,
'max_features': 'sqrt',
'n_folds': 4,
'val_weeks': 13,
'max_series': None,
'train_path': 'train_feat.parquet',
'test_path': 'test_feat.parquet'}
rf_fold[["fold", "train_start", "train_end", "val_start", "val_end", "wmae", "mae", "rmse", "runtime_sec"]]| fold | train_start | train_end | val_start | val_end | wmae | mae | rmse | runtime_sec | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 2011-10-28 | 2011-11-04 | 2012-01-27 | 2443.770022 | 2243.909149 | 4898.711983 | 14.831643 |
| 1 | 2 | 2010-02-05 | 2012-01-27 | 2012-02-03 | 2012-04-27 | 2132.907277 | 2148.134375 | 4393.405264 | 20.781198 |
| 2 | 3 | 2010-02-05 | 2012-04-27 | 2012-05-04 | 2012-07-27 | 1915.343992 | 1915.343992 | 3675.397679 | 19.938103 |
| 3 | 4 | 2010-02-05 | 2012-07-27 | 2012-08-03 | 2012-10-26 | 1738.982368 | 1719.540629 | 3554.458799 | 19.019421 |
rf_mean[["wmae", "mae", "rmse", "runtime_sec"]]
wmae 2057.750915
mae 2006.732036
rmse 4130.493432
runtime_sec 18.642591
Name: 4, dtype: object
9 Model 5: ETS Anomaly Baseline (Phase 5)
ETS here is used on residual anomalies after removing an exogenous linear component. This hybrid keeps classical exponential smoothing dynamics while allowing external regressors to explain systematic variation.
9.1 Mathematical Formulation
Exogenous residual decomposition:
\[ y^{anom}_{t,s}=g(x_{t,s}) + r_{t,s},\qquad g(\cdot)\text{ from Ridge regression} \]
Residual component (r_{t,s}) is modeled by ETS:
\[ r_{t,s} = \ell_{t-1,s}+b_{t-1,s}+s_{t-m,s}+\varepsilon_{t,s} \]
and final forecast:
\[ \hat{y}_{t,s}=clim_{t,s}+\hat{g}(x_{t,s})+\hat{r}_{t,s} \]
9.2 Plain-Language Explanation (What ETS Is)
ETS stands for Error, Trend, Seasonality. It is a classical time-series model that keeps three internal states and updates them week by week:
- level: the current baseline sales level for the series.
- trend: the direction/slope (increasing or decreasing behavior).
- seasonality: repeating seasonal pattern (here around a yearly weekly cycle).
In this project we first predict the anomaly using exogenous variables (g(x)), then ETS models the remaining residual pattern (r). The final prediction is:
- exogenous part
- plus ETS residual dynamics
- plus climatology baseline to return to sales scale.
9.3 Plain-Language Meaning of Terms
- y^{anom}: anomaly sales target.
- g(x): exogenous linear predictor (Ridge) from external variables.
- r_{t,s}: residual anomaly after removing the exogenous part.
- ell: level state.
- b: trend state.
- s: seasonal state.
- epsilon: random error term.
9.4 Workflow
flowchart LR
A[Read train/test parquet] --> B[Compute climatology and anomalies]
B --> C[Fit exogenous Ridge on anomaly target]
C --> D[Compute residual anomaly = target - exogenous prediction]
D --> E[Fit per-series ETS on residuals with fallback candidates]
E --> F[Recursive fold forecasting by series]
F --> G[Add exogenous component + climatology back]
G --> H[OOF metrics and test forecasts]
9.5 Tuned Parameters and Effect
# Summarize the tuned ETS-pipeline hyperparameters as a display table.
# NOTE(review): make_param_df is a notebook helper defined in an earlier cell
# (not visible here) — presumably it wraps these (name, value, description)
# tuples in a DataFrame; confirm against the helper's definition.
make_param_df(
[
("seasonal_periods", ets_cfg["seasonal_periods"], "Season length used by ETS seasonal state."),
("exog_alpha", ets_cfg.get("exog_alpha", None), "Ridge penalty for exogenous linear component."),
("exogenous_features", ets_cfg.get("exogenous_features", []), "Regressors used before ETS residual modeling."),
("n_folds", ets_cfg["n_folds"], "Forward-chaining fold count."),
("val_weeks", ets_cfg["val_weeks"], "Validation horizon length per fold."),
]
)
| Parameter | Value | How it works | |
|---|---|---|---|
| 0 | seasonal_periods | 52 | Season length used by ETS seasonal state. |
| 1 | exog_alpha | 1.0 | Ridge penalty for exogenous linear component. |
| 2 | exogenous_features | [temp_anom, fuel_anom, MarkDown1, MarkDown2, M... | Regressors used before ETS residual modeling. |
| 3 | n_folds | 4 | Forward-chaining fold count. |
| 4 | val_weeks | 13 | Validation horizon length per fold. |
ets_cfg
{'model': 'ets',
'target': 'sales_anom (via exogenous + ETS residual)',
'seasonal_periods': 52,
'exogenous_features': ['temp_anom',
'fuel_anom',
'MarkDown1',
'MarkDown2',
'MarkDown3',
'MarkDown4',
'MarkDown5',
'CPI',
'Unemployment'],
'exog_alpha': 1.0,
'n_folds': 4,
'val_weeks': 13,
'max_series': None,
'cv_mode_counts': {'level': 10599,
'level-trend': 1796,
'global-fallback': 77,
'last-value-fallback': 51,
'level-seasonal': 4},
'test_mode_counts': {'level': 2758,
'level-trend': 372,
'level-seasonal': 8,
'global-fallback': 11,
'last-value-fallback': 20},
'train_path': 'train_feat.parquet',
'test_path': 'test_feat.parquet'}
ets_fold[["fold", "train_start", "train_end", "val_start", "val_end", "wmae", "mae", "rmse", "runtime_sec"]]
| fold | train_start | train_end | val_start | val_end | wmae | mae | rmse | runtime_sec | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 2011-10-28 | 2011-11-04 | 2012-01-27 | 2350.860561 | 2193.524621 | 4865.080883 | 137.155356 |
| 1 | 2 | 2010-02-05 | 2012-01-27 | 2012-02-03 | 2012-04-27 | 1965.952051 | 1969.586155 | 4035.608919 | 839.189260 |
| 2 | 3 | 2010-02-05 | 2012-04-27 | 2012-05-04 | 2012-07-27 | 1585.766265 | 1585.766265 | 3052.148316 | 952.329147 |
| 3 | 4 | 2010-02-05 | 2012-07-27 | 2012-08-03 | 2012-10-26 | 1436.435312 | 1435.277361 | 2961.666930 | 974.137173 |
ets_mean[["wmae", "mae", "rmse", "runtime_sec"]]
wmae 1834.753547
mae 1796.0386
rmse 3728.626262
runtime_sec 725.702734
Name: 4, dtype: object
10 Model 6: AdaBoost Anomaly Baseline (Phase 7)
AdaBoost builds an additive ensemble of shallow trees, where each stage focuses more on difficult residual patterns from previous stages.
10.1 Mathematical Formulation
With base learners (h_m):
\[ F_M(x)=\sum_{m=1}^{M}\nu_m h_m(x) \]
where stage weights depend on chosen boosting loss and learning rate. In practice here, (h_m) are depth-limited regression trees.
10.2 Plain-Language Meaning of Terms
- h_m(x): base learner at stage m (a small regression tree).
- nu_m: effective contribution of stage m to the final prediction.
- M: number of boosting stages.
- learning_rate: shrinkage factor that controls how aggressively each stage updates the model.
- Boosting logic: each new tree focuses more on errors made by previous trees.
10.3 Workflow
flowchart LR
A[Read train/test parquet] --> B[Compute climatology and anomaly target]
B --> C[Create lag1/lag2 + calendar + exogenous features]
C --> D[Median imputation]
D --> E[Train AdaBoost regressor on fold train]
E --> F[Recursive validation rollout]
F --> G[OOF metrics and predictions]
G --> H[Refit on full train and produce test forecasts]
10.4 Tuned Parameters and Effect
# Summarize the tuned AdaBoost hyperparameters as a display table.
# NOTE(review): make_param_df is a notebook helper defined in an earlier cell
# (not visible here) — presumably it wraps these (name, value, description)
# tuples in a DataFrame; confirm against the helper's definition.
make_param_df(
[
("n_estimators", ad_cfg["n_estimators"], "Number of boosting stages."),
("learning_rate", ad_cfg["learning_rate"], "Shrinkage per stage; smaller needs more estimators."),
("max_depth", ad_cfg["max_depth"], "Depth of each base regression tree."),
("loss", ad_cfg["loss"], "Boosting loss that controls how hard examples are re-weighted."),
("lags", ad_cfg["lags"], "Autoregressive lag inputs for recursive forecasting."),
]
)
| Parameter | Value | How it works | |
|---|---|---|---|
| 0 | n_estimators | 300 | Number of boosting stages. |
| 1 | learning_rate | 0.03 | Shrinkage per stage; smaller needs more estima... |
| 2 | max_depth | 3 | Depth of each base regression tree. |
| 3 | loss | linear | Boosting loss that controls how hard examples ... |
| 4 | lags | [1, 2] | Autoregressive lag inputs for recursive foreca... |
ad_cfg
{'model': 'adaboost',
'target': 'sales_anom',
'features': ['lag1',
'lag2',
'week_of_year',
'month',
'is_holiday_int',
'temp_anom',
'fuel_anom',
'MarkDown1',
'MarkDown2',
'MarkDown3',
'MarkDown4',
'MarkDown5',
'CPI',
'Unemployment'],
'lags': [1, 2],
'n_estimators': 300,
'learning_rate': 0.03,
'max_depth': 3,
'loss': 'linear',
'n_folds': 4,
'val_weeks': 13,
'max_series': None,
'train_path': 'train_feat.parquet',
'test_path': 'test_feat.parquet'}
ad_fold[["fold", "train_start", "train_end", "val_start", "val_end", "wmae", "mae", "rmse", "runtime_sec"]]
| fold | train_start | train_end | val_start | val_end | wmae | mae | rmse | runtime_sec | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 2011-10-28 | 2011-11-04 | 2012-01-27 | 2479.961992 | 2278.508843 | 4905.691398 | 144.954249 |
| 1 | 2 | 2010-02-05 | 2012-01-27 | 2012-02-03 | 2012-04-27 | 2197.511715 | 2189.320166 | 4409.023648 | 189.685843 |
| 2 | 3 | 2010-02-05 | 2012-04-27 | 2012-05-04 | 2012-07-27 | 2014.048705 | 2014.048705 | 3866.751795 | 214.710872 |
| 3 | 4 | 2010-02-05 | 2012-07-27 | 2012-08-03 | 2012-10-26 | 1865.985156 | 1843.294438 | 3739.207824 | 239.991155 |
ad_mean[["wmae", "mae", "rmse", "runtime_sec"]]
wmae 2139.376892
mae 2081.293038
rmse 4230.168666
runtime_sec 197.33553
Name: 4, dtype: object
11 Final Comparative
11.1 Full-Data Comparison (Phases 1, 2, 3, 4, 5, 7 + Phase 1-lite Diagnostic)
# Assemble the cross-model comparison table from each model's mean fold
# metrics (the *_mean Series produced by the earlier training cells), then
# rank the rows from best (lowest) to worst mean WMAE.
full_comparison = pd.DataFrame(
    [
        {
            "Model": model_label,
            "Mean WMAE": float(mean_row["wmae"]),
            "Mean MAE": float(mean_row["mae"]),
            "Mean RMSE": float(mean_row["rmse"]),
        }
        for model_label, mean_row in (
            ("Local Linear Anomaly (Phase 1)", p1_mean),
            ("Structural AR (Phase 2)", p2_mean),
            ("Local Linear Lite (Phase 1-lite)", p1_lite_mean),
            ("Elastic Net (Phase 3)", en_mean),
            ("Random Forest (Phase 4)", rf_mean),
            ("ETS (Phase 5)", ets_mean),
            ("AdaBoost (Phase 7)", ad_mean),
        )
    ]
)
full_comparison = full_comparison.sort_values("Mean WMAE", ascending=True).reset_index(drop=True)
full_comparison
| Model | Mean WMAE | Mean MAE | Mean RMSE | |
|---|---|---|---|---|
| 0 | ETS (Phase 5) | 1834.753547 | 1796.038600 | 3728.626262 |
| 1 | Random Forest (Phase 4) | 2057.750915 | 2006.732036 | 4130.493432 |
| 2 | Elastic Net (Phase 3) | 2132.272878 | 2082.729444 | 4195.010969 |
| 3 | AdaBoost (Phase 7) | 2139.376892 | 2081.293038 | 4230.168666 |
| 4 | Local Linear Lite (Phase 1-lite) | 2161.186467 | 2104.994105 | 4336.037305 |
| 5 | Structural AR (Phase 2) | 2187.483231 | 2146.618420 | 4189.028433 |
| 6 | Local Linear Anomaly (Phase 1) | 10075.781360 | 10206.203892 | 18921.817643 |
# Bar chart of mean WMAE per model, colored consistently with earlier plots.
fig, ax = plt.subplots(figsize=(8, 4))
model_colors = {
    "Local Linear Anomaly (Phase 1)": "seagreen",
    "Local Linear Lite (Phase 1-lite)": "darkgreen",
    "Structural AR (Phase 2)": "firebrick",
    "Elastic Net (Phase 3)": "mediumorchid",
    "Random Forest (Phase 4)": "royalblue",
    "ETS (Phase 5)": "teal",
    "AdaBoost (Phase 7)": "darkorange",
}
# Look up one color per row, in table order (KeyError on an unknown model
# is intentional: it would signal a label mismatch with the table above).
bar_palette = list(map(model_colors.__getitem__, full_comparison["Model"]))
ax.bar(full_comparison["Model"], full_comparison["Mean WMAE"], color=bar_palette)
ax.set_ylabel("Mean WMAE")
ax.set_title("Full-Data Mean WMAE (Lower is Better)")
ax.grid(axis="y", alpha=0.25)
plt.xticks(rotation=10, ha="right")
plt.tight_layout()
plt.show()
12 Conclusion
- Full-data comparison is now apples-to-apples for all six models.
- All tabular baselines share the same anomaly feature interface with lag1 and lag2.
- The ranking table above is the reference for selecting the best model in this run.